    def __init__(self):
        self.publications_count = 400
        self.name = 'Копирование'  # display name: "Copying"
        self.file_name = 'copy_publications'
        self.base_dir = os.path.dirname(
            os.path.dirname(os.path.abspath(__file__)))
        self.pids_dir = os.path.join(self.base_dir, 'daemons/pids')
        self.statuses_dir = os.path.join(self.base_dir, 'daemons/statuses')
        self.context = Base().create_daemon_context(self.file_name)

        self.publication_table_columns = [
            'crawler__id',
            'crawler__name',
            'crawler__name_cyrillic',
            'title',
            'text',
            'date',
            'author',
        ]

        self.copypublication_table_columns = [
            'crawler_id',
            'title',
            'text',
            'date',
        ]

    def __init__(self):
        self.name = 'Поиск синонимов'  # display name: "Synonym search"
        self.file_name = 'links_synonims'

        self.base_dir = os.path.dirname(
            os.path.dirname(os.path.abspath(__file__)))
        self.pids_dir = os.path.join(self.base_dir, 'daemons/pids')
        self.statuses_dir = os.path.join(self.base_dir, 'daemons/statuses')
        self.context = Base().create_daemon_context(self.file_name)

        self.list_value = 40
        self.voc_models = {
            'NOUN': NOUN,
            'ADJF': ADJF,
            'ADJS': ADJS,
            'COMP': COMP,
            'VERB': VERB,
            'INFN': INFN,
            'PRTF': PRTF,
            'PRTS': PRTS,
            'GRND': GRND,
            'NUMR': NUMR,
            'ADVB': ADVB,
            'LATN': LATN,
            'NUMB': NUMB,
            'intg': intg,
            'real': real,
        }
        self.finded_synonims = None  # running count of processed words
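
The voc_models keys are pymorphy2 part-of-speech tags (NOUN, ADJF, ... GRND, NUMR, ADVB), plus pymorphy2's LATN/NUMB/intg/real grammemes for Latin words and numbers, so incoming words can be routed to the matching per-POS Django model. A minimal routing sketch, assuming only a stock pymorphy2 install (bucket_by_pos is an illustrative helper, not part of the original code):

    import pymorphy2

    morph = pymorphy2.MorphAnalyzer()

    def bucket_by_pos(words):
        # Group words by the POS tag of their most probable parse.
        # Words whose parse carries no POS (including LATN/NUMB
        # tokens) land in the 'None' bucket, which __replace_synonims
        # below deletes before touching the database.
        buckets = {}
        for word in words:
            pos = str(morph.parse(word)[0].tag.POS)
            buckets.setdefault(pos, []).append(word)
        return buckets
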
Example #3
    def __init__(self):
        self.publications_count = 400
        self.name = 'Создание хешей публикаций'  # display name: "Building publication hashes"
        self.file_name = 'make_hashes'
        self.morph = pymorphy2.MorphAnalyzer()  # pymorphy2 morphological analyzer

        self.base_dir = os.path.dirname(
            os.path.dirname(os.path.abspath(__file__)))
        self.pids_dir = os.path.join(self.base_dir, 'daemons/pids')
        self.statuses_dir = os.path.join(self.base_dir, 'daemons/statuses')
        self.context = Base().create_daemon_context(self.file_name)

        self.vocabulary = {
            'NOUN': NOUN,
            'ADJF': ADJF,
            'ADJS': ADJS,
            'COMP': COMP,
            'VERB': VERB,
            'INFN': INFN,
            'PRTF': PRTF,
            'PRTS': PRTS,
            'GRND': GRND,
            'NUMR': NUMR,
            'ADVB': ADVB,
            'LATN': LATN,
            'NUMB': NUMB,
            'intg': intg,
            'real': real,
        }

    def __init__(self):
        self.name = 'Поиск некорректных слов'  # display name: "Incorrect word detection"
        self.file_name = 'incorrect_word_selection'

        self.base_dir = os.path.dirname(
            os.path.dirname(os.path.abspath(__file__)))
        self.pids_dir = os.path.join(self.base_dir, 'daemons/pids')
        self.statuses_dir = os.path.join(self.base_dir, 'daemons/statuses')
        self.context = Base().create_daemon_context(self.file_name)

        self.list_value = 10
        self.voc_models = {
            'NOUN': NOUN,
            'ADJF': ADJF,
            'ADJS': ADJS,
            'COMP': COMP,
            'VERB': VERB,
            'INFN': INFN,
            'PRTF': PRTF,
            'PRTS': PRTS,
            'GRND': GRND,
            'NUMR': NUMR,
            'ADVB': ADVB,
            'LATN': LATN,
            'NUMB': NUMB,
            'intg': intg,
            'real': real,
        }
        self.words_checked_count = None  # words checked in the last run
Example #5
    def __init__(self):
        self.pub_without_status_length = 100
        self.retrospective_days_delta = 10
        self.name = 'Поиск нечетких дубликатов'  # display name: "Fuzzy duplicate search"
        self.file_name = 'pubcompare'

        self.base_dir = os.path.dirname(
            os.path.dirname(os.path.abspath(__file__)))
        self.pids_dir = os.path.join(self.base_dir, 'daemons/pids')
        self.statuses_dir = os.path.join(self.base_dir, 'daemons/statuses')
        self.context = Base().create_daemon_context(self.file_name)

    def push(self, date):

        Base().connection()

        if date is None:
            publications = list(
                Publication.objects.using('manager').all().values(
                    *self.publication_table_columns).order_by('date')
                [:self.publications_count])
        else:
            publications = list(
                Publication.objects.using('manager').filter(
                    date__gte=date).values(*self.publication_table_columns).
                order_by('date')[:self.publications_count])

        # drop duplicates that already exist in manager.Publication
        publications = self.__remove_doubles(publications)

        # drop duplicates that already exist in canonizator.CopyPublication
        if date is not None:

            copypublications = CopyPublication.objects.filter(
                date__gte=date -
                timedelta(days=1)).values(*self.copypublication_table_columns)

            publications = self.__remove_doubles_by_copypublication_table(
                publications, copypublications)

        # write the filtered publications into CopyPublication
        copypublications = []

        for publication in publications:
            copypublications.append(
                CopyPublication(
                    crawler_id=publication['crawler__id'],
                    name=publication['crawler__name'],
                    name_cyrillic=publication['crawler__name_cyrillic'],
                    title=publication['title'],
                    text=publication['text'],
                    date=publication['date'],
                    author=publication['author'],
                ))

        count = len(copypublications)

        if count > 0:

            Base().connection()

            CopyPublication.objects.bulk_create(copypublications)

        self.save_status(count)
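
push delegates deduplication to private helpers that are not part of this snippet. A minimal sketch of what __remove_doubles could look like, assuming duplicates are publications sharing the same (title, text, date) triple (the dedup key is an assumption, not the author's confirmed logic):

    def __remove_doubles(self, publications):
        # Hypothetical: keep the first occurrence of each
        # (title, text, date) triple and drop the rest.
        seen = set()
        unique = []
        for publication in publications:
            key = (publication['title'], publication['text'],
                   publication['date'])
            if key not in seen:
                seen.add(key)
                unique.append(publication)
        return unique
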
Example #7
    def __replace_synonims(self, vocabulary):

        Base().connection()

        if 'None' in vocabulary:
            del vocabulary['None']

        for key, words in vocabulary.items():

            pos_words = self.vocabulary[key].objects.filter(
                crc32__in=words).values('id', 'parent_id', 'crc32')

            parent_ids = []

            for pos_word in pos_words:
                parent_ids.append(pos_word['parent_id'])

            pos_parents = self.vocabulary[key].objects.filter(
                id__in=parent_ids).values('id', 'crc32')

            result = []

            for pos_word in pos_words:
                result_line = {}
                for pos_parent in pos_parents:
                    if pos_word['parent_id'] == pos_parent['id']:
                        result_line['word_parent'] = pos_parent['crc32']
                result_line['word'] = pos_word['crc32']
                result.append(result_line)

            vocabulary[key] = result

    def save_error(self):

        Base().connection()

        e_type, e_value, e_traceback = sys.exc_info()
        # format_exception returns a list of strings; join it so a
        # readable traceback is stored rather than the list repr
        VocabularyError.objects.create(
            error=''.join(
                traceback.format_exception(e_type, e_value, e_traceback)))

    def save_error(self):

        Base().connection()

        e_type, e_value, e_traceback = sys.exc_info()
        NormalizePublicationError.objects.create(
            error=''.join(
                traceback.format_exception(e_type, e_value, e_traceback)))
Exemple #10
0
    def start(self):

        Base().connection()

        packet = NormalizePublication.objects.filter(
            title_hashes={}).order_by('pubdate')[:self.publications_count]

        # fetch all the words used in the packet
        vocabulary = self.__get_all_words(packet)

        # pull in the synonyms for those words
        self.__replace_synonims(vocabulary)

        result = []

        for line in packet:

            result_line = {}

            title = self.__hash_list(line.title.split(' '))
            text = self.__hash_list(line.text.split(' '))

            result_line['title_hash'] = title
            result_line['text_hash'] = text

            # attach the numbers to the title
            result_line['title_words'] = self.__link_numbers(
                line.title_words, result_line['title_hash'], vocabulary)
            # combine all the words
            result_line['title_words'] = self.__append_n_sort(
                result_line['title_words'])
            # build the list of words with their parents
            result_line['title_words'] = self.__make_list_with_parents(
                result_line['title_words'])

            # attach the numbers to the text
            result_line['text_words'] = self.__link_numbers(
                line.text_words, result_line['text_hash'], vocabulary)
            # combine all the words
            result_line['text_words'] = self.__append_n_sort(
                result_line['text_words'])
            # build the list of words with their parents
            result_line['text_words'] = self.__make_list_with_parents(
                result_line['text_words'])

            result.append({
                'id': line.id,
                'title_hashes': result_line['title_words'],
                'text_hashes': result_line['text_words'],
            })

        for line in packet:
            for result_line in result:
                if line.id == result_line['id']:
                    line.title_hashes = result_line['title_hashes']
                    line.text_hashes = result_line['text_hashes']

        bulk_update(packet)

        self.save_status(len(packet))
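
start hashes titles and texts with a private __hash_list helper that is not shown here. Given the crc32 columns used throughout these snippets, a plausible sketch, assuming zlib is imported at module level (an assumption, not the confirmed implementation):

    def __hash_list(self, words):
        # Hypothetical: one CRC32 checksum per word, matching the
        # crc32 lookups used in __replace_synonims.
        return [zlib.crc32(word.encode('utf-8')) for word in words]
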

    def run_daemon(self):
        try:
            # DaemonContext.__enter__ calls open(), so entering the
            # context manager is enough to daemonize
            with self.context:
                while True:
                    Base().update_working_status(self, 'waiting')
                    can_program = Base().can_program(self)
                    if can_program:
                        Base().update_working_status(self, 'working')
                        self.start()
                        Base().update_working_status(self, 'waiting')
                        Base().update_pidfile(self)
                    time.sleep(300)
        except Exception:
            self.save_error()

    def get_last_error(self):
        Base().connection()
        try:
            max_date = NormalizePublicationError.objects.all().aggregate(
                Max('date'))['date__max']
            last_error = NormalizePublicationError.objects.get(date=max_date)
        except Exception:
            last_error = 'no status'
        return last_error

    def save_status(self, count):

        Base().connection()

        if count > 0:
            status = 'Ok'
        else:
            status = 'Empty'

        CopyPublicationStatus.objects.create(status=status, count=count)

    def get_date(self):

        Base().connection()

        try:
            date = CopyPublication.objects.all().aggregate(
                Max('date'))['date__max']
        except Exception:
            date = None
        return date
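
Together, get_date and push give incremental copying: a driver can resume from the newest date already present in CopyPublication. A minimal sketch, assuming the methods above sit on the same Program-style class as in the full listing below:

program = Program()
# None on the first run (empty CopyPublication) copies from the start;
# otherwise push re-reads from the latest copied date onward.
program.push(program.get_date())
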

    def get_last_pcopy_id(self):

        Base().connection()

        try:
            last_pcopy = NormalizePublication.objects.all().aggregate(
                Max('CopyPublication_id'))['CopyPublication_id__max']
        except Exception:
            last_pcopy = None
        return last_pcopy
Example #16
    def save_status(self, count):

        Base().connection()

        if count > 0:
            status = 'Ok'
        else:
            status = 'Empty'

        MakeHashesStatus.objects.create(status=status, count=count)

    def __remove_already_have(self, vocabulary):

        Base().connection()

        for key, words in vocabulary.items():
            doubles = self.voc_models[key].objects.filter(
                name__in=words).values('name')
            for double in doubles:
                self.__remove_from_array_by_value(words, double['name'])

    def get_last_error(self):

        Base().connection()

        try:
            max_date = VocabularyError.objects.all().aggregate(
                Max('date'))['date__max']
            last_error = VocabularyError.objects.get(date=max_date)
        except Exception:
            last_error = 'no status'
        return last_error

    def get_last_status(self):

        Base().connection()

        try:
            max_date = CopyPublicationStatus.objects.all().aggregate(
                Max('date'))['date__max']
            last_status = CopyPublicationStatus.objects.get(date=max_date)
        except Exception:
            last_status = 'no status'
        return last_status

    def save_status(self, count):

        Base().connection()

        if count is not None:
            status = 'Ok'
        else:
            status = 'Empty'
            count = 0

        VocabularyStatus.objects.create(status=status, count=count)
Example #21
    def save_status(self, count):

        Base().connection()

        if count > 0:
            status = 'Ok'
        else:
            status = 'Empty'

        PubCompareStatus.objects.create(
            status=status,
            count=count,
        )

    def get_pcopy_list(self, last_pcopy):

        Base().connection()

        # NOTE: the argument is ignored; the latest id is always re-read
        last_pcopy = self.get_last_pcopy_id()
        if last_pcopy is not None:
            pcopy_list = CopyPublication.objects.filter(
                id__gt=last_pcopy).values(
                    *self.copypublication_fields)[:self.list_value]
        else:
            pcopy_list = CopyPublication.objects.all().values(
                *self.copypublication_fields)[:self.list_value]
        return pcopy_list

    def __add_vocabulary_to_db(self, vocabulary):

        Base().connection()

        for key in vocabulary:
            words = []
            for word in vocabulary[key]:
                words.append(self.voc_models[key](
                    name=word,
                    crc32=self.__convert_crc32(word),
                ))
            if len(words) > 0:
                self.voc_models[key].objects.bulk_create(words)
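
__add_vocabulary_to_db relies on a private __convert_crc32 helper that is not shown. A sketch consistent with the integer crc32 columns used in the other snippets, assuming zlib is imported at module level (the exact implementation is an assumption):

    def __convert_crc32(self, word):
        # Hypothetical: CRC32 of the UTF-8 bytes; zlib.crc32 already
        # returns an unsigned 32-bit integer on Python 3.
        return zlib.crc32(word.encode('utf-8'))
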

    def __remove_already_have_grammems_to_remove(self, grammems_to_remove):

        Base().connection()

        for key, value in grammems_to_remove.items():
            doubles = self.grammems_to_remove_models[key].objects.filter(
                crc32__in=[word['crc32'] for word in value]).values('crc32')

            for double in doubles:
                for index, word in enumerate(value):
                    if word['crc32'] == double['crc32']:
                        del value[index]
                        break

    def __add_vocabulary_grammems_to_remove_to_db(self, grammems_to_remove):

        Base().connection()

        for key in grammems_to_remove:
            words = []
            for word in grammems_to_remove[key]:
                words.append(self.grammems_to_remove_models[key](
                    name=word['word'],
                    crc32=word['crc32'],
                ))
            if len(words) > 0:
                self.grammems_to_remove_models[key].objects.bulk_create(words)

    def start(self):

        Base().connection()

        for key, table in self.voc_models.items():
            words = table.objects.filter(
                vikidict_scaned=False)[:self.list_value]

            if len(words) > 0:
                result = Vikidict().start(words)
                self.update_db(table, result)

        # save the number of processed words
        self.save_status(self.finded_synonims)

        self.finded_synonims = None

    def start(self):

        Base().connection()

        for key, table in self.voc_models.items():
            words = table.objects.filter(
                vikidict_correction_tested=False,
                Tone__isnull=True,
            )[:self.list_value]

            if len(words) > 0:
                result = VikidictCorr().start(words)
                self.update_db(table, result)

        # save the number of checked words
        self.save_status(self.words_checked_count)

        self.words_checked_count = None

    def save(self, normalized_list):

        Base().connection()

        normalized_publications = []
        for item in normalized_list:
            normalized_publications.append(
                NormalizePublication(
                    crawler_id=item['crawler_id'],
                    name=item['name'],
                    name_cyrillic=item['name_cyrillic'],
                    title=item['title'],
                    text=item['text'],
                    author=item['author'],
                    pubdate=item['date'],
                    CopyPublication_id=item['id'],
                    title_words=item['title_words'],
                    text_words=item['text_words'],
                ))
        count = len(normalized_publications)
        if count > 0:
            NormalizePublication.objects.bulk_create(normalized_publications)

        self.save_status(count)

    def clear_vocabulary(self):
        Base().connection()

        for model in self.voc_models.values():
            model.objects.all().delete()

class Program:
    def __init__(self):
        self.name = 'Поиск синонимов'  # display name: "Synonym search"
        self.file_name = 'links_synonims'

        self.base_dir = os.path.dirname(
            os.path.dirname(os.path.abspath(__file__)))
        self.pids_dir = os.path.join(self.base_dir, 'daemons/pids')
        self.statuses_dir = os.path.join(self.base_dir, 'daemons/statuses')
        self.context = Base().create_daemon_context(self.file_name)

        self.list_value = 40
        self.voc_models = {
            'NOUN': NOUN,
            'ADJF': ADJF,
            'ADJS': ADJS,
            'COMP': COMP,
            'VERB': VERB,
            'INFN': INFN,
            'PRTF': PRTF,
            'PRTS': PRTS,
            'GRND': GRND,
            'NUMR': NUMR,
            'ADVB': ADVB,
            'LATN': LATN,
            'NUMB': NUMB,
            'intg': intg,
            'real': real,
        }
        self.finded_synonims = None  # running count of processed words

    def get_last_status(self):

        Base().connection()

        try:
            max_date = VocabularyStatus.objects.all().aggregate(
                Max('date'))['date__max']
            last_status = VocabularyStatus.objects.get(date=max_date)
        except Exception:
            last_status = 'no status'
        return last_status

    def get_last_error(self):

        Base().connection()

        try:
            max_date = VocabularyError.objects.all().aggregate(
                Max('date'))['date__max']
            last_error = VocabularyError.objects.get(date=max_date)
        except Exception:
            last_error = 'no status'
        return last_error

    def save_error(self):

        Base().connection()

        e_type, e_value, e_traceback = sys.exc_info()
        # format_exception returns a list of strings; join it so a
        # readable traceback is stored rather than the list repr
        VocabularyError.objects.create(
            error=''.join(
                traceback.format_exception(e_type, e_value, e_traceback)))

    def save_status(self, count):

        Base().connection()

        if count is not None:
            status = 'Ok'
        else:
            status = 'Empty'
            count = 0

        VocabularyStatus.objects.create(status=status, count=count)

    def start(self):

        Base().connection()

        for key, table in self.voc_models.items():
            words = table.objects.filter(
                vikidict_scaned=False)[:self.list_value]

            if len(words) > 0:
                result = Vikidict().start(words)
                self.update_db(table, result)

        # save the number of processed words
        self.save_status(self.finded_synonims)

        self.finded_synonims = None

    def update_db(self, table, result):
        # store the synonyms
        words_ids = []
        for line in result:
            for synonim in line['synonims']:
                word = table.objects.filter(crc32=synonim['crc32'])
                if word.exists():
                    word.update(
                        level=1,
                        parent_id=line['id'],
                        vikidict_scaned=True,
                    )
                else:
                    table.objects.create(name=synonim['synonim'],
                                         crc32=synonim['crc32'],
                                         vikidict_scaned=True)
            words_ids.append(line['id'])
        # mark the processed words
        table.objects.filter(id__in=words_ids).update(vikidict_scaned=True)

        if self.finded_synonims is None:
            self.finded_synonims = len(words_ids)
        else:
            self.finded_synonims += len(words_ids)
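
update_db implies a contract for the result returned by Vikidict().start: a list of scanned rows, each carrying its found synonyms. A hypothetical example of that shape (keys taken from the loop above, values invented purely for illustration):

result = [
    {
        'id': 17,  # id of the vocabulary row that was scanned
        'synonims': [
            {'synonim': 'пример', 'crc32': 123456789},
            {'synonim': 'образец', 'crc32': 987654321},
        ],
    },
]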

    # clear every vocabulary table
    def clear_vocabulary(self):
        Base().connection()

        for model in self.voc_models.values():
            model.objects.all().delete()

    ########################################
    # program entry point
    def run_daemon(self):
        try:
            # DaemonContext.__enter__ calls open(), so entering the
            # context manager is enough to daemonize
            with self.context:
                while True:
                    Base().update_working_status(self, 'waiting')
                    can_program = Base().can_program(self)
                    if can_program:
                        Base().update_working_status(self, 'working')
                        self.start()
                        Base().update_working_status(self, 'waiting')
                        Base().update_pidfile(self)
                    time.sleep(300)
        except Exception:
            self.save_error()

    def get_pid(self):

        processes = psutil.pids()

        pid_path = os.path.join(self.pids_dir,
                                '{0}.pid'.format(self.file_name))

        with open(pid_path, 'r') as pid_file:
            pid_value = int(pid_file.readlines()[0])

        if pid_value in processes:
            return pid_value
        else:
            # the process is gone: remove the stale pid file
            os.remove(pid_path)
            return None
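
A short liveness check built on get_pid, assuming run_daemon has previously written the pid file via Base().update_pidfile (with no pid file at all, open() would raise FileNotFoundError):

program = Program()
pid = program.get_pid()
if pid is not None:
    print('daemon is running, pid = {0}'.format(pid))
else:
    # get_pid has already removed the stale pid file in this branch
    print('daemon is not running')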