import os
import sys
import time
import traceback

import psutil
from django.db.models import Max

# Base, Vikidict, the vocabulary models (NOUN ... real) and the
# VocabularyStatus/VocabularyError models come from the project's own modules.


class Program:
    def __init__(self):
        self.name = 'Synonym search'
        self.file_name = 'links_synonims'
        self.base_dir = os.path.dirname(
            os.path.dirname(os.path.abspath(__file__)))
        self.pids_dir = os.path.join(self.base_dir, 'daemons/pids')
        self.statuses_dir = os.path.join(self.base_dir, 'daemons/statuses')
        self.context = Base().create_daemon_context(self.file_name)
        # batch size: unscanned words taken from each table per pass
        self.list_value = 40
        self.voc_models = {
            'NOUN': NOUN,
            'ADJF': ADJF,
            'ADJS': ADJS,
            'COMP': COMP,
            'VERB': VERB,
            'INFN': INFN,
            'PRTF': PRTF,
            'PRTS': PRTS,
            'GRND': GRND,
            'NUMR': NUMR,
            'ADVB': ADVB,
            'LATN': LATN,
            'NUMB': NUMB,
            'intg': intg,
            'real': real,
        }
        self.finded_synonims = None

    def get_last_status(self):
        Base().connection()
        try:
            max_date = VocabularyStatus.objects.all().aggregate(
                Max('date'))['date__max']
            last_status = VocabularyStatus.objects.get(date=max_date)
        except Exception:
            last_status = 'no status'
        return last_status

    def get_last_error(self):
        Base().connection()
        try:
            max_date = VocabularyError.objects.all().aggregate(
                Max('date'))['date__max']
            last_error = VocabularyError.objects.get(date=max_date)
        except Exception:
            last_error = 'no error'
        return last_error

    def save_error(self):
        Base().connection()
        e_type, e_value, e_traceback = sys.exc_info()
        VocabularyError.objects.create(error=''.join(
            traceback.format_exception(e_type, e_value, e_traceback)))

    def save_status(self, count):
        Base().connection()
        if count is not None:
            status = 'Ok'
        else:
            status = 'Empty'
            count = 0
        VocabularyStatus.objects.create(status=status, count=count)

    def start(self):
        Base().connection()
        for key, table in self.voc_models.items():
            words = table.objects.filter(
                vikidict_scaned=False)[:self.list_value]
            if len(words) > 0:
                result = Vikidict().start(words)
                self.update_db(table, result)
        # store the number of processed words
        self.save_status(self.finded_synonims)
        self.finded_synonims = None

    def update_db(self, table, result):
        # write the synonyms
        words_ids = []
        for line in result:
            for synonim in line['synonims']:
                word = table.objects.filter(crc32=synonim['crc32'])
                if word.exists():
                    word.update(
                        level=1,
                        parent_id=line['id'],
                        vikidict_scaned=True,
                    )
                else:
                    table.objects.create(name=synonim['synonim'],
                                         crc32=synonim['crc32'],
                                         vikidict_scaned=True)
            words_ids.append(line['id'])
        # mark the processed words
        table.objects.filter(id__in=words_ids).update(vikidict_scaned=True)
        if self.finded_synonims is None:
            self.finded_synonims = len(words_ids)
        else:
            self.finded_synonims += len(words_ids)

    # wipe all vocabulary tables
    def clear_vocabulary(self):
        Base().connection()
        for key, value in self.voc_models.items():
            value.objects.all().delete()

    ########################################
    # program entry point
    def run_daemon(self):
        try:
            self.context.open()
            with self.context:
                while True:
                    Base().update_working_status(self, 'waiting')
                    can_program = Base().can_program(self)
                    if can_program:
                        Base().update_working_status(self, 'working')
                        self.start()
                        Base().update_working_status(self, 'waiting')
                        Base().update_pidfile(self)
                        time.sleep(300)
                    else:
                        time.sleep(300)
        except Exception:
            self.save_error()

    def get_pid(self):
        processes = psutil.pids()
        pid_path = os.path.join(self.pids_dir,
                                '{0}.pid'.format(self.file_name))
        with open(pid_path, 'r') as pid_file:
            pid_value = int(pid_file.readlines()[0])
        if pid_value in processes:
            return pid_value
        # stale pid file: the process is gone, so remove the file
        os.remove(pid_path)
        return None
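# A minimal sketch of the CRC32 keying this daemon relies on when it matches
# synonyms against the vocabulary tables (crc32=synonim['crc32']). `word_key`
# is a hypothetical helper, not part of the codebase; it assumes the same
# normalization the pipeline applies elsewhere ('ё' -> 'е', lowercase
# normal forms).
import binascii

def word_key(word):
    word = word.lower().replace('ё', 'е')
    # CRC32 over the UTF-8 bytes, matching __convert_crc32 in the
    # canonization daemon below
    return binascii.crc32(word.encode('utf-8'))

# different surface forms of one word collapse to a single integer key
assert word_key('Ёжик') == word_key('ежик')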
import os
import sys
import time
import traceback
from datetime import timedelta

import psutil
from django.db.models import Max

# Base, Publication, CopyPublication and the CopyPublicationStatus/
# CopyPublicationError models come from the project's own modules.


class Program:
    def __init__(self):
        self.publications_count = 400
        self.name = 'Copying'
        self.file_name = 'copy_publications'
        self.base_dir = os.path.dirname(
            os.path.dirname(os.path.abspath(__file__)))
        self.pids_dir = os.path.join(self.base_dir, 'daemons/pids')
        self.statuses_dir = os.path.join(self.base_dir, 'daemons/statuses')
        self.context = Base().create_daemon_context(self.file_name)
        self.publication_table_columns = [
            'crawler__id',
            'crawler__name',
            'crawler__name_cyrillic',
            'title',
            'text',
            'date',
            'author',
        ]
        self.copypublication_table_columns = [
            'crawler_id',
            'title',
            'text',
            'date',
        ]

    def get_last_status(self):
        Base().connection()
        try:
            max_date = CopyPublicationStatus.objects.all().aggregate(
                Max('date'))['date__max']
            last_status = CopyPublicationStatus.objects.get(date=max_date)
        except Exception:
            last_status = 'no status'
        return last_status

    def get_last_error(self):
        Base().connection()
        try:
            max_date = CopyPublicationError.objects.all().aggregate(
                Max('date'))['date__max']
            last_error = CopyPublicationError.objects.get(date=max_date)
        except Exception:
            last_error = 'no error'
        return last_error

    def save_error(self):
        Base().connection()
        e_type, e_value, e_traceback = sys.exc_info()
        CopyPublicationError.objects.create(error=''.join(
            traceback.format_exception(e_type, e_value, e_traceback)))

    def save_status(self, count):
        Base().connection()
        if count > 0:
            status = 'Ok'
        else:
            status = 'Empty'
        CopyPublicationStatus.objects.create(status=status, count=count)

    def get_date(self):
        Base().connection()
        try:
            date = CopyPublication.objects.all().aggregate(
                Max('date'))['date__max']
        except Exception:
            date = None
        return date

    def __remove_doubles(self, publications):
        # drop duplicates inside the batch itself; the scan restarts after
        # every deletion so the enumerate indices stay valid
        for key, publication in enumerate(publications):
            if any(p['title'] == publication['title']
                   and p['text'] == publication['text']
                   and p['date'] == publication['date']
                   and p['crawler__id'] == publication['crawler__id']
                   for p in publications[key + 1:]):
                del publications[key]
                return self.__remove_doubles(publications)
        return publications

    def __remove_doubles_by_copypublication_table(self, publications,
                                                  copypublications):
        # drop publications that were already copied on a previous pass
        for key, publication in enumerate(publications):
            if any(p['crawler_id'] == publication['crawler__id']
                   and p['title'] == publication['title']
                   and p['text'] == publication['text']
                   and p['date'] == publication['date']
                   for p in copypublications):
                del publications[key]
                return self.__remove_doubles_by_copypublication_table(
                    publications, copypublications)
        return publications

    def push(self, date):
        Base().connection()
        if date is None:
            publications = list(
                Publication.objects.using('manager').all().values(
                    *self.publication_table_columns).order_by('date')
                [:self.publications_count])
        else:
            publications = list(
                Publication.objects.using('manager').filter(
                    date__gte=date).values(
                        *self.publication_table_columns).order_by('date')
                [:self.publications_count])
        # remove duplicates that exist inside manager.Publication
        publications = self.__remove_doubles(publications)
        # remove duplicates that already exist in canonizator.CopyPublication
        if date is not None:
            copypublications = CopyPublication.objects.filter(
                date__gte=date - timedelta(days=1)).values(
                    *self.copypublication_table_columns)
            publications = self.__remove_doubles_by_copypublication_table(
                publications, copypublications)
        # write the filtered publications into CopyPublication
        copypublications = []
        for publication in publications:
            copypublications.append(
                CopyPublication(
                    crawler_id=publication['crawler__id'],
                    name=publication['crawler__name'],
                    name_cyrillic=publication['crawler__name_cyrillic'],
                    title=publication['title'],
                    text=publication['text'],
                    date=publication['date'],
                    author=publication['author'],
                ))
        count = len(copypublications)
        if count > 0:
            Base().connection()
            CopyPublication.objects.bulk_create(copypublications)
        self.save_status(count)

    ########################################
    # program entry point
    def run_daemon(self):
        try:
            self.context.open()
            with self.context:
                while True:
                    Base().update_working_status(self, 'waiting')
                    can_program = Base().can_program(self)
                    if can_program:
                        Base().update_working_status(self, 'working')
                        date = self.get_date()
                        self.push(date)
                        Base().update_working_status(self, 'waiting')
                        Base().update_pidfile(self)
                        time.sleep(300)
                    else:
                        time.sleep(300)
        except Exception:
            self.save_error()

    def get_pid(self):
        processes = psutil.pids()
        pid_path = os.path.join(self.pids_dir,
                                '{0}.pid'.format(self.file_name))
        with open(pid_path, 'r') as pid_file:
            pid_value = int(pid_file.readlines()[0])
        if pid_value in processes:
            return pid_value
        # stale pid file: the process is gone, so remove the file
        os.remove(pid_path)
        return None
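# __remove_doubles above rescans the whole batch recursively after every
# deletion, which is quadratic and can exhaust the recursion limit on large
# batches. A hedged, stdlib-only alternative with the same outcome (first
# occurrence wins, same identity key); `dedupe` is a hypothetical helper,
# not part of the codebase:
def dedupe(publications):
    seen = set()
    result = []
    for p in publications:
        key = (p['crawler__id'], p['title'], p['text'], p['date'])
        if key not in seen:
            seen.add(key)
            result.append(p)
    return result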
import binascii
import os
import re
import sys
import time
import traceback

import psutil
import pymorphy2
from django.db.models import Max
from more_itertools import unique_everseen  # assumed source of unique_everseen

# Base, CopyPublication, NormalizePublication, the vocabulary models
# (NOUN ... real, NPRO ... UNKN) and the status/error models come from the
# project's own modules.


class Program:
    def __init__(self):
        self.list_value = 400
        self.name = 'Canonization'
        self.file_name = 'normalize_publications'
        self.morth = pymorphy2.MorphAnalyzer()
        self.base_dir = os.path.dirname(
            os.path.dirname(os.path.abspath(__file__)))
        self.pids_dir = os.path.join(self.base_dir, 'daemons/pids')
        self.statuses_dir = os.path.join(self.base_dir, 'daemons/statuses')
        self.context = Base().create_daemon_context(self.file_name)
        self.punctuations = re.compile(
            '([-_<>?/\\".„”“%,{}@#!&()=+:;«»—$&£*])')
        # control characters and markup fragments replaced with spaces
        # (note: '\v' == '\x0b' and '\f' == '\x0c', so only one of each
        # pair is kept)
        self.replace_with_spaces = {
            '\n', '\r', '\r\n', '\v', '\f', '\x1c', '\x1d', '\x1e',
            '\x85', '\u2028', '\u2029', '\t', '\xa0', ' ',
            '<br>', '<br />', '<p>', '</p>', '...',
        }
        self.copypublication_fields = [
            'crawler_id',
            'name',
            'name_cyrillic',
            'title',
            'text',
            'author',
            'date',
            'id',
        ]
        # parts of speech excluded from the fuzzy-duplicate comparison
        self.grammems_to_remove = {
            'NPRO', 'PRED', 'PREP', 'CONJ', 'PRCL', 'INTJ', 'ROMN', 'UNKN'
        }
        self.grammems_to_remove_vocabulary = {
            'NPRO': [],
            'PRED': [],
            'PREP': [],
            'CONJ': [],
            'PRCL': [],
            'INTJ': [],
            'ROMN': [],
            'UNKN': [],
        }
        self.grammems_to_remove_models = {
            'NPRO': NPRO,
            'PRED': PRED,
            'PREP': PREP,
            'CONJ': CONJ,
            'PRCL': PRCL,
            'INTJ': INTJ,
            'ROMN': ROMN,
            'UNKN': UNKN,
        }
        self.vocabulary = {
            'NOUN': [],
            'ADJF': [],
            'ADJS': [],
            'COMP': [],
            'VERB': [],
            'INFN': [],
            'PRTF': [],
            'PRTS': [],
            'GRND': [],
            'NUMR': [],
            'ADVB': [],
            'LATN': [],
            'NUMB': [],
            'intg': [],
            'real': [],
        }
        self.voc_models = {
            'NOUN': NOUN,
            'ADJF': ADJF,
            'ADJS': ADJS,
            'COMP': COMP,
            'VERB': VERB,
            'INFN': INFN,
            'PRTF': PRTF,
            'PRTS': PRTS,
            'GRND': GRND,
            'NUMR': NUMR,
            'ADVB': ADVB,
            'LATN': LATN,
            'NUMB': NUMB,
            'intg': intg,
            'real': real,
        }

    def get_last_status(self):
        Base().connection()
        try:
            max_date = NormalizePublicationStatus.objects.all().aggregate(
                Max('date'))['date__max']
            last_status = NormalizePublicationStatus.objects.get(
                date=max_date)
        except Exception:
            last_status = 'no status'
        return last_status

    def get_last_error(self):
        Base().connection()
        try:
            max_date = NormalizePublicationError.objects.all().aggregate(
                Max('date'))['date__max']
            last_error = NormalizePublicationError.objects.get(date=max_date)
        except Exception:
            last_error = 'no error'
        return last_error

    def __clear_vocabulary(self, vocabulary):
        for key, value in vocabulary.items():
            del vocabulary[key][:]

    def start(self):
        last_pcopy = self.get_last_pcopy_id()
        pcopy_list = self.get_pcopy_list(last_pcopy)
        normalized_list = self.normalize(pcopy_list)
        self.save(normalized_list)
        ###############################
        # process the vocabulary
        self.__remove_doubles(self.vocabulary)
        self.__remove_already_have(self.vocabulary)
        self.__add_vocabulary_to_db(self.vocabulary)
        self.__clear_vocabulary(self.vocabulary)
        # save the excluded grammemes
        self.__remove_already_have_grammems_to_remove(
            self.grammems_to_remove_vocabulary)
        self.__add_vocabulary_grammems_to_remove_to_db(
            self.grammems_to_remove_vocabulary)
        self.__clear_vocabulary(self.grammems_to_remove_vocabulary)

    # remove duplicate values from the lists in a vocabulary
    def __remove_doubles(self, vocabulary):
        for key in vocabulary:
            vocabulary[key] = list(unique_everseen(vocabulary[key]))

    # remove excluded words that are already in the DB
    def __remove_already_have_grammems_to_remove(self, grammems_to_remove):
        Base().connection()
        for key, value in grammems_to_remove.items():
            doubles = self.grammems_to_remove_models[key].objects.filter(
                crc32__in=[word['crc32']
                           for word in grammems_to_remove[key]
                           ]).values('crc32')
            for double in doubles:
                for key2, word in enumerate(value):
                    if word['crc32'] == double['crc32']:
                        del value[key2]
                        break

    # remove words that are already in the DB
    def __remove_already_have(self, vocabulary):
        Base().connection()
        for key, value in vocabulary.items():
            doubles = self.voc_models[key].objects.filter(
                name__in=vocabulary[key]).values('name')
            for double in doubles:
                self.__remove_from_array_by_value(vocabulary[key],
                                                  double['name'])

    # remove an item from a list by value
    def __remove_from_array_by_value(self, array, value):
        if value in array:
            array.remove(value)

    # save the excluded parts-of-speech lists to the DB
    def __add_vocabulary_grammems_to_remove_to_db(self, grammems_to_remove):
        Base().connection()
        for key in grammems_to_remove:
            words = []
            for word in grammems_to_remove[key]:
                words.append(self.grammems_to_remove_models[key](
                    name=word['word'],
                    crc32=word['crc32'],
                ))
            if len(words) > 0:
                self.grammems_to_remove_models[key].objects.bulk_create(words)

    # save the parts-of-speech lists to the DB
    def __add_vocabulary_to_db(self, vocabulary):
        Base().connection()
        for key in vocabulary:
            words = []
            for word in vocabulary[key]:
                words.append(self.voc_models[key](
                    name=word,
                    crc32=self.__convert_crc32(word),
                ))
            if len(words) > 0:
                self.voc_models[key].objects.bulk_create(words)

    def __convert_crc32(self, value):
        value_bytes = bytes(value, 'utf-8')
        return binascii.crc32(value_bytes)

    def save_error(self):
        Base().connection()
        e_type, e_value, e_traceback = sys.exc_info()
        NormalizePublicationError.objects.create(error=''.join(
            traceback.format_exception(e_type, e_value, e_traceback)))

    def save_status(self, count):
        Base().connection()
        if count > 0:
            status = 'Ok'
        else:
            status = 'Empty'
        NormalizePublicationStatus.objects.create(status=status, count=count)

    def remove_punctuation(self, string):
        for key in self.replace_with_spaces:
            string = string.replace(key, ' ')
        string = re.sub(self.punctuations, '', string)
        string = string.replace('ё', 'е')
        return string

    def split_line(self, line):
        words_list = line.split(' ')
        return words_list

    def parse_to_morph(self, word):
        # take the most probable pymorphy2 parse
        return self.morth.parse(word)[0]

    def check_word(self, parsed_to_morph):
        if parsed_to_morph.tag.POS in self.grammems_to_remove:
            return False
        return True

    def normalize_word(self, parsed_to_morph):
        # TODO: the word should also be lowercased, with 'ё' replaced by 'е'
        normal_form = parsed_to_morph.normal_form
        # record every encountered word in the vocabulary
        self.fill_vocabulary(parsed_to_morph, normal_form)
        return normal_form

    # fill the vocabulary
    def fill_vocabulary(self, parsed_to_morph, normal_form):
        pos = parsed_to_morph.tag.POS
        if pos in self.vocabulary:
            self.vocabulary[pos].append(normal_form)

    def get_last_pcopy_id(self):
        Base().connection()
        try:
            last_pcopy = NormalizePublication.objects.all().aggregate(
                Max('CopyPublication_id'))['CopyPublication_id__max']
        except Exception:
            last_pcopy = None
        return last_pcopy

    def get_pcopy_list(self, last_pcopy):
        Base().connection()
        if last_pcopy is not None:
            pcopy_list = CopyPublication.objects.filter(
                id__gt=last_pcopy).values(
                    *self.copypublication_fields)[:self.list_value]
        else:
            pcopy_list = CopyPublication.objects.all().values(
                *self.copypublication_fields)[:self.list_value]
        return pcopy_list

    def normalize(self, pcopy_list):
        for pcopy in pcopy_list:
            pcopy['title'] = self.remove_punctuation(pcopy['title'])
            pcopy['text'] = self.remove_punctuation(pcopy['text'])
            title = []
            title_words = {}
            self.__check_n_normalize(title, title_words,
                                     self.split_line(pcopy['title']))
            pcopy['title'] = ' '.join(title)
            pcopy['title_words'] = title_words
            text = []
            text_words = {}
            self.__check_n_normalize(text, text_words,
                                     self.split_line(pcopy['text']))
            pcopy['text'] = ' '.join(text)
            pcopy['text_words'] = text_words
        return pcopy_list

    def __check_n_normalize(self, exp_list, exp_voc_list, words):
        for word in words:
            word_parsed_to_morph = self.parse_to_morph(word)
            if self.check_word(word_parsed_to_morph):
                normalized_word = self.normalize_word(word_parsed_to_morph)
                word_crc_32 = self.__convert_crc32(normalized_word)
                # plain list of normalized words
                exp_list.append(normalized_word)
                # part-of-speech dictionary with crc32 values
                pos = str(word_parsed_to_morph.tag.POS)
                if pos not in exp_voc_list:
                    exp_voc_list[pos] = [word_crc_32]
                else:
                    exp_voc_list[pos].append(word_crc_32)
            else:
                # fill the vocabulary of parts of speech excluded from the
                # fuzzy-duplicate comparison
                normalized_word = self.normalize_word(word_parsed_to_morph)
                word_crc_32 = self.__convert_crc32(normalized_word)
                pos = str(word_parsed_to_morph.tag.POS)
                if not any(voc_word['word'] == normalized_word
                           for voc_word in
                           self.grammems_to_remove_vocabulary[pos]):
                    self.grammems_to_remove_vocabulary[pos].append({
                        'word': normalized_word,
                        'crc32': word_crc_32
                    })

    def save(self, normalized_list):
        Base().connection()
        normalized_publications = []
        for item in normalized_list:
            normalized_publications.append(
                NormalizePublication(
                    crawler_id=item['crawler_id'],
                    name=item['name'],
                    name_cyrillic=item['name_cyrillic'],
                    title=item['title'],
                    text=item['text'],
                    author=item['author'],
                    pubdate=item['date'],
                    CopyPublication_id=item['id'],
                    title_words=item['title_words'],
                    text_words=item['text_words'],
                ))
        count = len(normalized_publications)
        if count > 0:
            NormalizePublication.objects.bulk_create(normalized_publications)
        self.save_status(count)

    ########################################
    # program entry point
    def run_daemon(self):
        try:
            self.context.open()
            with self.context:
                while True:
                    Base().update_working_status(self, 'waiting')
                    can_program = Base().can_program(self)
                    if can_program:
                        Base().update_working_status(self, 'working')
                        self.start()
                        Base().update_working_status(self, 'waiting')
                        Base().update_pidfile(self)
                        time.sleep(300)
                    else:
                        time.sleep(300)
        except Exception:
            self.save_error()

    def get_pid(self):
        processes = psutil.pids()
        pid_path = os.path.join(self.pids_dir,
                                '{0}.pid'.format(self.file_name))
        with open(pid_path, 'r') as pid_file:
            pid_value = int(pid_file.readlines()[0])
        if pid_value in processes:
            return pid_value
        # stale pid file: the process is gone, so remove the file
        os.remove(pid_path)
        return None
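# The canonizer leans on pymorphy2 for both the part-of-speech filter
# (check_word) and the normal form (normalize_word). A self-contained sketch
# of that flow using the real pymorphy2 API; the sample phrase and the
# grammeme set are illustrative:
import pymorphy2

morph = pymorphy2.MorphAnalyzer()
skip = {'NPRO', 'PRED', 'PREP', 'CONJ', 'PRCL', 'INTJ', 'ROMN', 'UNKN'}

for word in 'стали закупать продукты'.split():
    parsed = morph.parse(word)[0]       # most probable parse
    if parsed.tag.POS not in skip:      # same check as check_word()
        print(word, '->', parsed.normal_form)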
import os
import sys
import time
import traceback

import psutil
from django.db.models import Max
from django.utils import timezone
from django_bulk_update.helper import bulk_update  # assumed source of bulk_update

# Base, PubCompare, NormalizePublication and the PubCompareStatus/
# PubCompareError models come from the project's own modules.


class Program:
    def __init__(self):
        self.pub_without_status_length = 100
        self.retrospective_days_delta = 10
        self.name = 'Fuzzy duplicate search'
        self.file_name = 'pubcompare'
        self.base_dir = os.path.dirname(
            os.path.dirname(os.path.abspath(__file__)))
        self.pids_dir = os.path.join(self.base_dir, 'daemons/pids')
        self.statuses_dir = os.path.join(self.base_dir, 'daemons/statuses')
        self.context = Base().create_daemon_context(self.file_name)

    def get_last_status(self):
        Base().connection()
        try:
            max_date = PubCompareStatus.objects.all().aggregate(
                Max('date'))['date__max']
            last_status = PubCompareStatus.objects.get(date=max_date)
        except Exception:
            last_status = 'no status'
        return last_status

    def get_last_error(self):
        Base().connection()
        try:
            max_date = PubCompareError.objects.all().aggregate(
                Max('date'))['date__max']
            last_error = PubCompareError.objects.get(date=max_date)
        except Exception:
            last_error = 'no error'
        return last_error

    def save_error(self):
        Base().connection()
        e_type, e_value, e_traceback = sys.exc_info()
        PubCompareError.objects.create(error=''.join(
            traceback.format_exception(e_type, e_value, e_traceback)))

    def save_status(self, count):
        Base().connection()
        if count > 0:
            status = 'Ok'
        else:
            status = 'Empty'
        PubCompareStatus.objects.create(
            status=status,
            count=count,
        )

    def __get_pub_without_status_min_date(self, pub_list):
        return pub_list[0].pubdate

    def get_pub_without_status(self):
        pub_list = NormalizePublication.objects.filter(
            status__isnull=True).exclude(title_hashes={}).order_by(
                'pubdate')[:self.pub_without_status_length]
        return pub_list

    def __get_unique_publications_min_date(self, pub_without_status_min_date):
        return pub_without_status_min_date - timezone.timedelta(
            days=self.retrospective_days_delta)

    def __get_unique_publications(self, pub_without_status_min_date):
        min_date_unique = self.__get_unique_publications_min_date(
            pub_without_status_min_date)
        pub_unique = NormalizePublication.objects.filter(
            pubdate__gt=min_date_unique,
            pubdate__lt=pub_without_status_min_date,
            status=PubCompare().status['unique']['db_value']).values(
                'id', 'title_hashes', 'text_hashes')
        result = []
        for pub in pub_unique:
            result.append({
                'id': pub['id'],
                'title_hashes': pub['title_hashes'],
                'text_hashes': pub['text_hashes']
            })
        return result

    def start(self):
        # publications without a status yet
        publications = self.get_pub_without_status()
        if len(publications) == 0:
            # nothing to compare on this pass
            self.save_status(0)
            return
        pub_without_status_min_date = self.__get_pub_without_status_min_date(
            publications)
        # unique publications from the retrospective window
        unique_publications = self.__get_unique_publications(
            pub_without_status_min_date)
        # status search
        self.__search_status(publications, unique_publications)
        bulk_update(publications)
        self.save_status(len(publications))

    def __search_status(self, publications, unique_publications):
        for publication in publications:
            self.__compare_publication_with_unique_publications(
                publication, unique_publications)

    def __compare_publication_with_unique_publications(self, publication,
                                                       unique_publications):
        if len(unique_publications) > 0:
            for unique_publication in unique_publications:
                result = PubCompare().get_status(publication,
                                                 unique_publication)
                if result['status'] == 'reprint':
                    publication.status = PubCompare(
                    ).status['reprint']['db_value']
                    publication.parent_id = unique_publication['id']
                    break
                if result['status'] == 'copy':
                    publication.status = PubCompare(
                    ).status['copy']['db_value']
                    publication.parent_id = unique_publication['id']
                    break
            if publication.status is None:
                publication.status = PubCompare().status['unique']['db_value']
                self.__add_publication_in_unique_publications(
                    publication, unique_publications)
        else:
            publication.status = PubCompare().status['unique']['db_value']
            self.__add_publication_in_unique_publications(
                publication, unique_publications)

    def __add_publication_in_unique_publications(self, publication,
                                                 unique_publications):
        unique_publications.append({
            'id': publication.id,
            'title_hashes': publication.title_hashes,
            'text_hashes': publication.text_hashes,
        })

    def clear_statuses(self):
        NormalizePublication.objects.exclude(status__isnull=True).update(
            status=None)

    ########################################
    # program entry point
    def run_daemon(self):
        try:
            self.context.open()
            with self.context:
                while True:
                    Base().update_working_status(self, 'waiting')
                    can_program = Base().can_program(self)
                    if can_program:
                        Base().update_working_status(self, 'working')
                        self.start()
                        Base().update_working_status(self, 'waiting')
                        Base().update_pidfile(self)
                        time.sleep(300)
                    else:
                        time.sleep(300)
        except Exception:
            self.save_error()

    def get_pid(self):
        processes = psutil.pids()
        pid_path = os.path.join(self.pids_dir,
                                '{0}.pid'.format(self.file_name))
        with open(pid_path, 'r') as pid_file:
            pid_value = int(pid_file.readlines()[0])
        if pid_value in processes:
            return pid_value
        # stale pid file: the process is gone, so remove the file
        os.remove(pid_path)
        return None
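# PubCompare().get_status is project code not shown here; the daemon above
# only feeds it the title_hashes/text_hashes lists built by make_hashes. As
# an illustration only, one common way to score such CRC32 hash lists is
# Jaccard similarity over their sets; `jaccard`, `classify` and both
# thresholds are hypothetical, not the project's actual rules:
def jaccard(hashes_a, hashes_b):
    a, b = set(hashes_a), set(hashes_b)
    if not a or not b:
        return 0.0
    return len(a & b) / len(a | b)

def classify(pub_hashes, unique_hashes, copy_at=0.9, reprint_at=0.6):
    score = jaccard(pub_hashes, unique_hashes)
    if score >= copy_at:
        return 'copy'
    if score >= reprint_at:
        return 'reprint'
    return 'unique'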
import binascii
import os
import sys
import time
import traceback

import psutil
import pymorphy2
from django.db.models import Max
from django_bulk_update.helper import bulk_update  # assumed source of bulk_update

# Base, NormalizePublication, the vocabulary models (NOUN ... real) and the
# MakeHashesStatus/MakeHashesError models come from the project's own modules.


class Program:
    def __init__(self):
        self.publications_count = 400
        self.name = 'Publication hash creation'
        self.file_name = 'make_hashes'
        self.morth = pymorphy2.MorphAnalyzer()
        self.base_dir = os.path.dirname(
            os.path.dirname(os.path.abspath(__file__)))
        self.pids_dir = os.path.join(self.base_dir, 'daemons/pids')
        self.statuses_dir = os.path.join(self.base_dir, 'daemons/statuses')
        self.context = Base().create_daemon_context(self.file_name)
        self.vocabulary = {
            'NOUN': NOUN,
            'ADJF': ADJF,
            'ADJS': ADJS,
            'COMP': COMP,
            'VERB': VERB,
            'INFN': INFN,
            'PRTF': PRTF,
            'PRTS': PRTS,
            'GRND': GRND,
            'NUMR': NUMR,
            'ADVB': ADVB,
            'LATN': LATN,
            'NUMB': NUMB,
            'intg': intg,
            'real': real,
        }

    def get_last_status(self):
        Base().connection()
        try:
            max_date = MakeHashesStatus.objects.all().aggregate(
                Max('date'))['date__max']
            last_status = MakeHashesStatus.objects.get(date=max_date)
        except Exception:
            last_status = 'no status'
        return last_status

    def get_last_error(self):
        Base().connection()
        try:
            max_date = MakeHashesError.objects.all().aggregate(
                Max('date'))['date__max']
            last_error = MakeHashesError.objects.get(date=max_date)
        except Exception:
            last_error = 'no error'
        return last_error

    def save_error(self):
        Base().connection()
        e_type, e_value, e_traceback = sys.exc_info()
        MakeHashesError.objects.create(error=''.join(
            traceback.format_exception(e_type, e_value, e_traceback)))

    def save_status(self, count):
        Base().connection()
        if count > 0:
            status = 'Ok'
        else:
            status = 'Empty'
        MakeHashesStatus.objects.create(status=status, count=count)

    def __get_all_words(self, packet):
        vocabulary = {'None': []}
        for key in self.vocabulary:
            vocabulary[key] = []
        for line in packet:
            for key, words in line.title_words.items():
                self.__add_in_vocabulary_if_not_exists(key, words, vocabulary)
            for key, words in line.text_words.items():
                self.__add_in_vocabulary_if_not_exists(key, words, vocabulary)
        return vocabulary

    def __add_in_vocabulary_if_not_exists(self, key, words, vocabulary):
        for word in words:
            if word not in vocabulary[key]:
                vocabulary[key].append(word)

    def __replace_synonims(self, vocabulary):
        Base().connection()
        if 'None' in vocabulary:
            del vocabulary['None']
        for key, words in vocabulary.items():
            pos_words = self.vocabulary[key].objects.filter(
                crc32__in=words).values('id', 'parent_id', 'crc32')
            parent_ids = []
            for pos_word in pos_words:
                parent_ids.append(pos_word['parent_id'])
            pos_parents = self.vocabulary[key].objects.filter(
                id__in=parent_ids).values('id', 'crc32')
            result = []
            for pos_word in pos_words:
                result_line = {}
                for pos_parent in pos_parents:
                    if pos_word['parent_id'] == pos_parent['id']:
                        result_line['word_parent'] = pos_parent['crc32']
                result_line['word'] = pos_word['crc32']
                result.append(result_line)
            vocabulary[key] = result

    def __add_parent(self, result, vocabulary):
        for pos, words in vocabulary.items():
            doubled = 0
            for word in words:
                if word['word'] == result['word']:
                    if 'word_parent' in word:
                        result['word_parent'] = word['word_parent']
                    doubled = 1
                    break
            if doubled == 0:
                break

    def __link_numbers(self, poses_words, poses_hash, vocabulary):
        result = {}
        for pos, words in poses_words.items():
            result_list = []
            for word in words:
                result_line = {}
                no = self.__find_number(word, poses_hash)
                result_line['word'] = word
                result_line['no'] = no
                # attach the parent (synonym head word), if any
                self.__add_parent(result_line, vocabulary)
                result_list.append(result_line)
            result[pos] = result_list
        return result

    def __append_n_sort(self, line_words):
        result = []
        for pos, words in line_words.items():
            if pos != 'None':
                for word in words:
                    result.append(word)
        return sorted(result, key=lambda word: word['no'])

    def __make_list_with_parents(self, line_words):
        result = []
        for line_word in line_words:
            if 'word_parent' in line_word:
                result.append(line_word['word_parent'])
            else:
                result.append(line_word['word'])
        return result

    def start(self):
        Base().connection()
        packet = NormalizePublication.objects.filter(
            title_hashes={}).order_by('pubdate')[:self.publications_count]
        # collect all words from the packet
        vocabulary = self.__get_all_words(packet)
        # pull in synonyms
        self.__replace_synonims(vocabulary)
        result = []
        for line in packet:
            result_line = {}
            result_line['title_hash'] = self.__hash_list(
                line.title.split(' '))
            result_line['text_hash'] = self.__hash_list(line.text.split(' '))
            # attach position numbers to the title words
            result_line['title_words'] = self.__link_numbers(
                line.title_words, result_line['title_hash'], vocabulary)
            # merge all words
            result_line['title_words'] = self.__append_n_sort(
                result_line['title_words'])
            # build the final word list
            result_line['title_words'] = self.__make_list_with_parents(
                result_line['title_words'])
            # attach position numbers to the text words
            result_line['text_words'] = self.__link_numbers(
                line.text_words, result_line['text_hash'], vocabulary)
            # merge all words
            result_line['text_words'] = self.__append_n_sort(
                result_line['text_words'])
            # build the final word list
            result_line['text_words'] = self.__make_list_with_parents(
                result_line['text_words'])
            result.append({
                'id': line.id,
                'title_hashes': result_line['title_words'],
                'text_hashes': result_line['text_words'],
            })
        for line in packet:
            for result_line in result:
                if line.id == result_line['id']:
                    line.title_hashes = result_line['title_hashes']
                    line.text_hashes = result_line['text_hashes']
        bulk_update(packet)
        self.save_status(len(packet))

    # NOTE: appears unused and incomplete ('title_has' is never a key of
    # list_words, and nothing is returned)
    def __append_numbers(self, list_words):
        result_list = []
        for pos, words in list_words.items():
            result_line = []
            for word in words:
                no = self.__find_number(word, list_words['title_has'])

    def __find_number(self, word_to_find, words_list):
        for key, word in enumerate(words_list):
            if word_to_find == word:
                return key

    def __hash_list(self, words_list):
        crc32 = []
        for word in words_list:
            crc32.append(binascii.crc32(bytes(word, 'utf-8')))
        return crc32

    def delete_hashes(self):
        NormalizePublication.objects.exclude(title_hashes={}).update(
            title_hashes={},
            text_hashes={},
        )

    ########################################
    # program entry point
    def run_daemon(self):
        try:
            self.context.open()
            with self.context:
                while True:
                    Base().update_working_status(self, 'waiting')
                    can_program = Base().can_program(self)
                    if can_program:
                        Base().update_working_status(self, 'working')
                        self.start()
                        Base().update_working_status(self, 'waiting')
                        Base().update_pidfile(self)
                        time.sleep(300)
                    else:
                        time.sleep(300)
        except Exception:
            self.save_error()

    def get_pid(self):
        processes = psutil.pids()
        pid_path = os.path.join(self.pids_dir,
                                '{0}.pid'.format(self.file_name))
        with open(pid_path, 'r') as pid_file:
            pid_value = int(pid_file.readlines()[0])
        if pid_value in processes:
            return pid_value
        # stale pid file: the process is gone, so remove the file
        os.remove(pid_path)
        return None
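# How __hash_list and __find_number cooperate in this daemon: the title and
# text are hashed word-by-word with CRC32, and __link_numbers then maps each
# vocabulary word back to its position in that hash list so word order
# survives synonym replacement. A self-contained illustration (stdlib only;
# the sample sentence is arbitrary):
import binascii

def hash_list(words):
    return [binascii.crc32(w.encode('utf-8')) for w in words]

title_words = 'курс доллара вырос'.split()
title_hashes = hash_list(title_words)
# the position of 'доллара' recovered from its hash, as __find_number does
assert title_hashes.index(hash_list(['доллара'])[0]) == 1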