import os
import sys
import time
import traceback

import psutil
from django.db.models import Max

# Base, Vikidict, the vocabulary models (NOUN ... real) and the
# VocabularyStatus/VocabularyError models come from the project's own modules.


class Program:
    def __init__(self):
        self.name = 'Synonym search'
        self.file_name = 'links_synonims'
        self.base_dir = os.path.dirname(
            os.path.dirname(os.path.abspath(__file__)))
        self.pids_dir = os.path.join(self.base_dir, 'daemons/pids')
        self.statuses_dir = os.path.join(self.base_dir, 'daemons/statuses')
        self.context = Base().create_daemon_context(self.file_name)
        # batch size: unscanned words taken from each table per pass
        self.list_value = 40
        self.voc_models = {
            'NOUN': NOUN,
            'ADJF': ADJF,
            'ADJS': ADJS,
            'COMP': COMP,
            'VERB': VERB,
            'INFN': INFN,
            'PRTF': PRTF,
            'PRTS': PRTS,
            'GRND': GRND,
            'NUMR': NUMR,
            'ADVB': ADVB,
            'LATN': LATN,
            'NUMB': NUMB,
            'intg': intg,
            'real': real,
        }
        self.finded_synonims = None

    def get_last_status(self):
        Base().connection()
        try:
            max_date = VocabularyStatus.objects.all().aggregate(
                Max('date'))['date__max']
            last_status = VocabularyStatus.objects.get(date=max_date)
        except Exception:
            last_status = 'no status'
        return last_status

    def get_last_error(self):
        Base().connection()
        try:
            max_date = VocabularyError.objects.all().aggregate(
                Max('date'))['date__max']
            last_error = VocabularyError.objects.get(date=max_date)
        except Exception:
            last_error = 'no error'
        return last_error

    def save_error(self):
        Base().connection()
        e_type, e_value, e_traceback = sys.exc_info()
        VocabularyError.objects.create(error=''.join(
            traceback.format_exception(e_type, e_value, e_traceback)))

    def save_status(self, count):
        Base().connection()
        if count is not None:
            status = 'Ok'
        else:
            status = 'Empty'
            count = 0
        VocabularyStatus.objects.create(status=status, count=count)

    def start(self):
        Base().connection()
        for key, table in self.voc_models.items():
            words = table.objects.filter(
                vikidict_scaned=False)[:self.list_value]
            if len(words) > 0:
                result = Vikidict().start(words)
                self.update_db(table, result)
        # store the number of processed words
        self.save_status(self.finded_synonims)
        self.finded_synonims = None

    def update_db(self, table, result):
        # write the synonyms
        words_ids = []
        for line in result:
            for synonim in line['synonims']:
                word = table.objects.filter(crc32=synonim['crc32'])
                if word.exists():
                    word.update(
                        level=1,
                        parent_id=line['id'],
                        vikidict_scaned=True,
                    )
                else:
                    table.objects.create(name=synonim['synonim'],
                                         crc32=synonim['crc32'],
                                         vikidict_scaned=True)
            words_ids.append(line['id'])
        # mark the processed words
        table.objects.filter(id__in=words_ids).update(vikidict_scaned=True)
        if self.finded_synonims is None:
            self.finded_synonims = len(words_ids)
        else:
            self.finded_synonims += len(words_ids)

    # wipe all vocabulary tables
    def clear_vocabulary(self):
        Base().connection()
        for key, value in self.voc_models.items():
            value.objects.all().delete()

    ########################################
    # program entry point
    def run_daemon(self):
        try:
            self.context.open()
            with self.context:
                while True:
                    Base().update_working_status(self, 'waiting')
                    can_program = Base().can_program(self)
                    if can_program:
                        Base().update_working_status(self, 'working')
                        self.start()
                        Base().update_working_status(self, 'waiting')
                        Base().update_pidfile(self)
                        time.sleep(300)
                    else:
                        time.sleep(300)
        except Exception:
            self.save_error()

    def get_pid(self):
        processes = psutil.pids()
        pid_path = os.path.join(self.pids_dir,
                                '{0}.pid'.format(self.file_name))
        with open(pid_path, 'r') as pid_file:
            pid_value = int(pid_file.readlines()[0])
        if pid_value in processes:
            return pid_value
        # stale pid file: the process is gone, so remove the file
        os.remove(pid_path)
        return None
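# A minimal sketch of the CRC32 keying this daemon relies on when it matches
# synonyms against the vocabulary tables (crc32=synonim['crc32']). `word_key`
# is a hypothetical helper, not part of the codebase; it assumes the same
# normalization the pipeline applies elsewhere ('ё' -> 'е', lowercase
# normal forms).
import binascii

def word_key(word):
    word = word.lower().replace('ё', 'е')
    # CRC32 over the UTF-8 bytes, matching __convert_crc32 in the
    # canonization daemon below
    return binascii.crc32(word.encode('utf-8'))

# different surface forms of one word collapse to a single integer key
assert word_key('Ёжик') == word_key('ежик')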
import os
import sys
import time
import traceback
from datetime import timedelta

import psutil
from django.db.models import Max

# Base, Publication, CopyPublication and the CopyPublicationStatus/
# CopyPublicationError models come from the project's own modules.


class Program:
    def __init__(self):
        self.publications_count = 400
        self.name = 'Copying'
        self.file_name = 'copy_publications'
        self.base_dir = os.path.dirname(
            os.path.dirname(os.path.abspath(__file__)))
        self.pids_dir = os.path.join(self.base_dir, 'daemons/pids')
        self.statuses_dir = os.path.join(self.base_dir, 'daemons/statuses')
        self.context = Base().create_daemon_context(self.file_name)
        self.publication_table_columns = [
            'crawler__id',
            'crawler__name',
            'crawler__name_cyrillic',
            'title',
            'text',
            'date',
            'author',
        ]
        self.copypublication_table_columns = [
            'crawler_id',
            'title',
            'text',
            'date',
        ]

    def get_last_status(self):
        Base().connection()
        try:
            max_date = CopyPublicationStatus.objects.all().aggregate(
                Max('date'))['date__max']
            last_status = CopyPublicationStatus.objects.get(date=max_date)
        except Exception:
            last_status = 'no status'
        return last_status

    def get_last_error(self):
        Base().connection()
        try:
            max_date = CopyPublicationError.objects.all().aggregate(
                Max('date'))['date__max']
            last_error = CopyPublicationError.objects.get(date=max_date)
        except Exception:
            last_error = 'no error'
        return last_error

    def save_error(self):
        Base().connection()
        e_type, e_value, e_traceback = sys.exc_info()
        CopyPublicationError.objects.create(error=''.join(
            traceback.format_exception(e_type, e_value, e_traceback)))

    def save_status(self, count):
        Base().connection()
        if count > 0:
            status = 'Ok'
        else:
            status = 'Empty'
        CopyPublicationStatus.objects.create(status=status, count=count)

    def get_date(self):
        Base().connection()
        try:
            date = CopyPublication.objects.all().aggregate(
                Max('date'))['date__max']
        except Exception:
            date = None
        return date

    def __remove_doubles(self, publications):
        # drop duplicates inside the batch itself; the scan restarts after
        # every deletion so the enumerate indices stay valid
        for key, publication in enumerate(publications):
            if any(p['title'] == publication['title']
                   and p['text'] == publication['text']
                   and p['date'] == publication['date']
                   and p['crawler__id'] == publication['crawler__id']
                   for p in publications[key + 1:]):
                del publications[key]
                return self.__remove_doubles(publications)
        return publications

    def __remove_doubles_by_copypublication_table(self, publications,
                                                  copypublications):
        # drop publications that were already copied on a previous pass
        for key, publication in enumerate(publications):
            if any(p['crawler_id'] == publication['crawler__id']
                   and p['title'] == publication['title']
                   and p['text'] == publication['text']
                   and p['date'] == publication['date']
                   for p in copypublications):
                del publications[key]
                return self.__remove_doubles_by_copypublication_table(
                    publications, copypublications)
        return publications

    def push(self, date):
        Base().connection()
        if date is None:
            publications = list(
                Publication.objects.using('manager').all().values(
                    *self.publication_table_columns).order_by('date')
                [:self.publications_count])
        else:
            publications = list(
                Publication.objects.using('manager').filter(
                    date__gte=date).values(
                        *self.publication_table_columns).order_by('date')
                [:self.publications_count])
        # remove duplicates that exist inside manager.Publication
        publications = self.__remove_doubles(publications)
        # remove duplicates that already exist in canonizator.CopyPublication
        if date is not None:
            copypublications = CopyPublication.objects.filter(
                date__gte=date - timedelta(days=1)).values(
                    *self.copypublication_table_columns)
            publications = self.__remove_doubles_by_copypublication_table(
                publications, copypublications)
        # write the filtered publications into CopyPublication
        copypublications = []
        for publication in publications:
            copypublications.append(
                CopyPublication(
                    crawler_id=publication['crawler__id'],
                    name=publication['crawler__name'],
                    name_cyrillic=publication['crawler__name_cyrillic'],
                    title=publication['title'],
                    text=publication['text'],
                    date=publication['date'],
                    author=publication['author'],
                ))
        count = len(copypublications)
        if count > 0:
            Base().connection()
            CopyPublication.objects.bulk_create(copypublications)
        self.save_status(count)

    ########################################
    # program entry point
    def run_daemon(self):
        try:
            self.context.open()
            with self.context:
                while True:
                    Base().update_working_status(self, 'waiting')
                    can_program = Base().can_program(self)
                    if can_program:
                        Base().update_working_status(self, 'working')
                        date = self.get_date()
                        self.push(date)
                        Base().update_working_status(self, 'waiting')
                        Base().update_pidfile(self)
                        time.sleep(300)
                    else:
                        time.sleep(300)
        except Exception:
            self.save_error()

    def get_pid(self):
        processes = psutil.pids()
        pid_path = os.path.join(self.pids_dir,
                                '{0}.pid'.format(self.file_name))
        with open(pid_path, 'r') as pid_file:
            pid_value = int(pid_file.readlines()[0])
        if pid_value in processes:
            return pid_value
        # stale pid file: the process is gone, so remove the file
        os.remove(pid_path)
        return None
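# __remove_doubles above rescans the whole batch recursively after every
# deletion, which is quadratic and can exhaust the recursion limit on large
# batches. A hedged, stdlib-only alternative with the same outcome (first
# occurrence wins, same identity key); `dedupe` is a hypothetical helper,
# not part of the codebase:
def dedupe(publications):
    seen = set()
    result = []
    for p in publications:
        key = (p['crawler__id'], p['title'], p['text'], p['date'])
        if key not in seen:
            seen.add(key)
            result.append(p)
    return result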
import binascii
import os
import re
import sys
import time
import traceback

import psutil
import pymorphy2
from django.db.models import Max
from more_itertools import unique_everseen  # assumed source of unique_everseen

# Base, CopyPublication, NormalizePublication, the vocabulary models
# (NOUN ... real, NPRO ... UNKN) and the status/error models come from the
# project's own modules.


class Program:
    def __init__(self):
        self.list_value = 400
        self.name = 'Canonization'
        self.file_name = 'normalize_publications'
        self.morth = pymorphy2.MorphAnalyzer()
        self.base_dir = os.path.dirname(
            os.path.dirname(os.path.abspath(__file__)))
        self.pids_dir = os.path.join(self.base_dir, 'daemons/pids')
        self.statuses_dir = os.path.join(self.base_dir, 'daemons/statuses')
        self.context = Base().create_daemon_context(self.file_name)
        self.punctuations = re.compile(
            '([-_<>?/\\".„”“%,{}@#!&()=+:;«»—$&£*])')
        # control characters and markup fragments replaced with spaces
        # (note: '\v' == '\x0b' and '\f' == '\x0c', so only one of each
        # pair is kept)
        self.replace_with_spaces = {
            '\n', '\r', '\r\n', '\v', '\f', '\x1c', '\x1d', '\x1e',
            '\x85', '\u2028', '\u2029', '\t', '\xa0', ' ',
            '<br>', '<br />', '<p>', '</p>', '...',
        }
        self.copypublication_fields = [
            'crawler_id',
            'name',
            'name_cyrillic',
            'title',
            'text',
            'author',
            'date',
            'id',
        ]
        # parts of speech excluded from the fuzzy-duplicate comparison
        self.grammems_to_remove = {
            'NPRO', 'PRED', 'PREP', 'CONJ', 'PRCL', 'INTJ', 'ROMN', 'UNKN'
        }
        self.grammems_to_remove_vocabulary = {
            'NPRO': [],
            'PRED': [],
            'PREP': [],
            'CONJ': [],
            'PRCL': [],
            'INTJ': [],
            'ROMN': [],
            'UNKN': [],
        }
        self.grammems_to_remove_models = {
            'NPRO': NPRO,
            'PRED': PRED,
            'PREP': PREP,
            'CONJ': CONJ,
            'PRCL': PRCL,
            'INTJ': INTJ,
            'ROMN': ROMN,
            'UNKN': UNKN,
        }
        self.vocabulary = {
            'NOUN': [],
            'ADJF': [],
            'ADJS': [],
            'COMP': [],
            'VERB': [],
            'INFN': [],
            'PRTF': [],
            'PRTS': [],
            'GRND': [],
            'NUMR': [],
            'ADVB': [],
            'LATN': [],
            'NUMB': [],
            'intg': [],
            'real': [],
        }
        self.voc_models = {
            'NOUN': NOUN,
            'ADJF': ADJF,
            'ADJS': ADJS,
            'COMP': COMP,
            'VERB': VERB,
            'INFN': INFN,
            'PRTF': PRTF,
            'PRTS': PRTS,
            'GRND': GRND,
            'NUMR': NUMR,
            'ADVB': ADVB,
            'LATN': LATN,
            'NUMB': NUMB,
            'intg': intg,
            'real': real,
        }

    def get_last_status(self):
        Base().connection()
        try:
            max_date = NormalizePublicationStatus.objects.all().aggregate(
                Max('date'))['date__max']
            last_status = NormalizePublicationStatus.objects.get(
                date=max_date)
        except Exception:
            last_status = 'no status'
        return last_status

    def get_last_error(self):
        Base().connection()
        try:
            max_date = NormalizePublicationError.objects.all().aggregate(
                Max('date'))['date__max']
            last_error = NormalizePublicationError.objects.get(date=max_date)
        except Exception:
            last_error = 'no error'
        return last_error

    def __clear_vocabulary(self, vocabulary):
        for key, value in vocabulary.items():
            del vocabulary[key][:]

    def start(self):
        last_pcopy = self.get_last_pcopy_id()
        pcopy_list = self.get_pcopy_list(last_pcopy)
        normalized_list = self.normalize(pcopy_list)
        self.save(normalized_list)
        ###############################
        # process the vocabulary
        self.__remove_doubles(self.vocabulary)
        self.__remove_already_have(self.vocabulary)
        self.__add_vocabulary_to_db(self.vocabulary)
        self.__clear_vocabulary(self.vocabulary)
        # save the excluded grammemes
        self.__remove_already_have_grammems_to_remove(
            self.grammems_to_remove_vocabulary)
        self.__add_vocabulary_grammems_to_remove_to_db(
            self.grammems_to_remove_vocabulary)
        self.__clear_vocabulary(self.grammems_to_remove_vocabulary)

    # remove duplicate values from the lists in a vocabulary
    def __remove_doubles(self, vocabulary):
        for key in vocabulary:
            vocabulary[key] = list(unique_everseen(vocabulary[key]))

    # remove excluded words that are already in the DB
    def __remove_already_have_grammems_to_remove(self, grammems_to_remove):
        Base().connection()
        for key, value in grammems_to_remove.items():
            doubles = self.grammems_to_remove_models[key].objects.filter(
                crc32__in=[word['crc32']
                           for word in grammems_to_remove[key]
                           ]).values('crc32')
            for double in doubles:
                for key2, word in enumerate(value):
                    if word['crc32'] == double['crc32']:
                        del value[key2]
                        break

    # remove words that are already in the DB
    def __remove_already_have(self, vocabulary):
        Base().connection()
        for key, value in vocabulary.items():
            doubles = self.voc_models[key].objects.filter(
                name__in=vocabulary[key]).values('name')
            for double in doubles:
                self.__remove_from_array_by_value(vocabulary[key],
                                                  double['name'])

    # remove an item from a list by value
    def __remove_from_array_by_value(self, array, value):
        if value in array:
            array.remove(value)

    # save the excluded parts-of-speech lists to the DB
    def __add_vocabulary_grammems_to_remove_to_db(self, grammems_to_remove):
        Base().connection()
        for key in grammems_to_remove:
            words = []
            for word in grammems_to_remove[key]:
                words.append(self.grammems_to_remove_models[key](
                    name=word['word'],
                    crc32=word['crc32'],
                ))
            if len(words) > 0:
                self.grammems_to_remove_models[key].objects.bulk_create(words)

    # save the parts-of-speech lists to the DB
    def __add_vocabulary_to_db(self, vocabulary):
        Base().connection()
        for key in vocabulary:
            words = []
            for word in vocabulary[key]:
                words.append(self.voc_models[key](
                    name=word,
                    crc32=self.__convert_crc32(word),
                ))
            if len(words) > 0:
                self.voc_models[key].objects.bulk_create(words)

    def __convert_crc32(self, value):
        value_bytes = bytes(value, 'utf-8')
        return binascii.crc32(value_bytes)

    def save_error(self):
        Base().connection()
        e_type, e_value, e_traceback = sys.exc_info()
        NormalizePublicationError.objects.create(error=''.join(
            traceback.format_exception(e_type, e_value, e_traceback)))

    def save_status(self, count):
        Base().connection()
        if count > 0:
            status = 'Ok'
        else:
            status = 'Empty'
        NormalizePublicationStatus.objects.create(status=status, count=count)

    def remove_punctuation(self, string):
        for key in self.replace_with_spaces:
            string = string.replace(key, ' ')
        string = re.sub(self.punctuations, '', string)
        string = string.replace('ё', 'е')
        return string

    def split_line(self, line):
        words_list = line.split(' ')
        return words_list

    def parse_to_morph(self, word):
        # take the most probable pymorphy2 parse
        return self.morth.parse(word)[0]

    def check_word(self, parsed_to_morph):
        if parsed_to_morph.tag.POS in self.grammems_to_remove:
            return False
        return True

    def normalize_word(self, parsed_to_morph):
        # TODO: the word should also be lowercased, with 'ё' replaced by 'е'
        normal_form = parsed_to_morph.normal_form
        # record every encountered word in the vocabulary
        self.fill_vocabulary(parsed_to_morph, normal_form)
        return normal_form

    # fill the vocabulary
    def fill_vocabulary(self, parsed_to_morph, normal_form):
        pos = parsed_to_morph.tag.POS
        if pos in self.vocabulary:
            self.vocabulary[pos].append(normal_form)

    def get_last_pcopy_id(self):
        Base().connection()
        try:
            last_pcopy = NormalizePublication.objects.all().aggregate(
                Max('CopyPublication_id'))['CopyPublication_id__max']
        except Exception:
            last_pcopy = None
        return last_pcopy

    def get_pcopy_list(self, last_pcopy):
        Base().connection()
        if last_pcopy is not None:
            pcopy_list = CopyPublication.objects.filter(
                id__gt=last_pcopy).values(
                    *self.copypublication_fields)[:self.list_value]
        else:
            pcopy_list = CopyPublication.objects.all().values(
                *self.copypublication_fields)[:self.list_value]
        return pcopy_list

    def normalize(self, pcopy_list):
        for pcopy in pcopy_list:
            pcopy['title'] = self.remove_punctuation(pcopy['title'])
            pcopy['text'] = self.remove_punctuation(pcopy['text'])
            title = []
            title_words = {}
            self.__check_n_normalize(title, title_words,
                                     self.split_line(pcopy['title']))
            pcopy['title'] = ' '.join(title)
            pcopy['title_words'] = title_words
            text = []
            text_words = {}
            self.__check_n_normalize(text, text_words,
                                     self.split_line(pcopy['text']))
            pcopy['text'] = ' '.join(text)
            pcopy['text_words'] = text_words
        return pcopy_list

    def __check_n_normalize(self, exp_list, exp_voc_list, words):
        for word in words:
            word_parsed_to_morph = self.parse_to_morph(word)
            if self.check_word(word_parsed_to_morph):
                normalized_word = self.normalize_word(word_parsed_to_morph)
                word_crc_32 = self.__convert_crc32(normalized_word)
                # plain list of normalized words
                exp_list.append(normalized_word)
                # part-of-speech dictionary with crc32 values
                pos = str(word_parsed_to_morph.tag.POS)
                if pos not in exp_voc_list:
                    exp_voc_list[pos] = [word_crc_32]
                else:
                    exp_voc_list[pos].append(word_crc_32)
            else:
                # fill the vocabulary of parts of speech excluded from the
                # fuzzy-duplicate comparison
                normalized_word = self.normalize_word(word_parsed_to_morph)
                word_crc_32 = self.__convert_crc32(normalized_word)
                pos = str(word_parsed_to_morph.tag.POS)
                if not any(voc_word['word'] == normalized_word
                           for voc_word in
                           self.grammems_to_remove_vocabulary[pos]):
                    self.grammems_to_remove_vocabulary[pos].append({
                        'word': normalized_word,
                        'crc32': word_crc_32
                    })

    def save(self, normalized_list):
        Base().connection()
        normalized_publications = []
        for item in normalized_list:
            normalized_publications.append(
                NormalizePublication(
                    crawler_id=item['crawler_id'],
                    name=item['name'],
                    name_cyrillic=item['name_cyrillic'],
                    title=item['title'],
                    text=item['text'],
                    author=item['author'],
                    pubdate=item['date'],
                    CopyPublication_id=item['id'],
                    title_words=item['title_words'],
                    text_words=item['text_words'],
                ))
        count = len(normalized_publications)
        if count > 0:
            NormalizePublication.objects.bulk_create(normalized_publications)
        self.save_status(count)

    ########################################
    # program entry point
    def run_daemon(self):
        try:
            self.context.open()
            with self.context:
                while True:
                    Base().update_working_status(self, 'waiting')
                    can_program = Base().can_program(self)
                    if can_program:
                        Base().update_working_status(self, 'working')
                        self.start()
                        Base().update_working_status(self, 'waiting')
                        Base().update_pidfile(self)
                        time.sleep(300)
                    else:
                        time.sleep(300)
        except Exception:
            self.save_error()

    def get_pid(self):
        processes = psutil.pids()
        pid_path = os.path.join(self.pids_dir,
                                '{0}.pid'.format(self.file_name))
        with open(pid_path, 'r') as pid_file:
            pid_value = int(pid_file.readlines()[0])
        if pid_value in processes:
            return pid_value
        # stale pid file: the process is gone, so remove the file
        os.remove(pid_path)
        return None
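# The canonizer leans on pymorphy2 for both the part-of-speech filter
# (check_word) and the normal form (normalize_word). A self-contained sketch
# of that flow using the real pymorphy2 API; the sample phrase and the
# grammeme set are illustrative:
import pymorphy2

morph = pymorphy2.MorphAnalyzer()
skip = {'NPRO', 'PRED', 'PREP', 'CONJ', 'PRCL', 'INTJ', 'ROMN', 'UNKN'}

for word in 'стали закупать продукты'.split():
    parsed = morph.parse(word)[0]       # most probable parse
    if parsed.tag.POS not in skip:      # same check as check_word()
        print(word, '->', parsed.normal_form)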
import os
import sys
import time
import traceback

import psutil
from django.db.models import Max
from django.utils import timezone
from django_bulk_update.helper import bulk_update  # assumed source of bulk_update

# Base, PubCompare, NormalizePublication and the PubCompareStatus/
# PubCompareError models come from the project's own modules.


class Program:
    def __init__(self):
        self.pub_without_status_length = 100
        self.retrospective_days_delta = 10
        self.name = 'Fuzzy duplicate search'
        self.file_name = 'pubcompare'
        self.base_dir = os.path.dirname(
            os.path.dirname(os.path.abspath(__file__)))
        self.pids_dir = os.path.join(self.base_dir, 'daemons/pids')
        self.statuses_dir = os.path.join(self.base_dir, 'daemons/statuses')
        self.context = Base().create_daemon_context(self.file_name)

    def get_last_status(self):
        Base().connection()
        try:
            max_date = PubCompareStatus.objects.all().aggregate(
                Max('date'))['date__max']
            last_status = PubCompareStatus.objects.get(date=max_date)
        except Exception:
            last_status = 'no status'
        return last_status

    def get_last_error(self):
        Base().connection()
        try:
            max_date = PubCompareError.objects.all().aggregate(
                Max('date'))['date__max']
            last_error = PubCompareError.objects.get(date=max_date)
        except Exception:
            last_error = 'no error'
        return last_error

    def save_error(self):
        Base().connection()
        e_type, e_value, e_traceback = sys.exc_info()
        PubCompareError.objects.create(error=''.join(
            traceback.format_exception(e_type, e_value, e_traceback)))

    def save_status(self, count):
        Base().connection()
        if count > 0:
            status = 'Ok'
        else:
            status = 'Empty'
        PubCompareStatus.objects.create(
            status=status,
            count=count,
        )

    def __get_pub_without_status_min_date(self, pub_list):
        return pub_list[0].pubdate

    def get_pub_without_status(self):
        pub_list = NormalizePublication.objects.filter(
            status__isnull=True).exclude(title_hashes={}).order_by(
                'pubdate')[:self.pub_without_status_length]
        return pub_list

    def __get_unique_publications_min_date(self, pub_without_status_min_date):
        return pub_without_status_min_date - timezone.timedelta(
            days=self.retrospective_days_delta)

    def __get_unique_publications(self, pub_without_status_min_date):
        min_date_unique = self.__get_unique_publications_min_date(
            pub_without_status_min_date)
        pub_unique = NormalizePublication.objects.filter(
            pubdate__gt=min_date_unique,
            pubdate__lt=pub_without_status_min_date,
            status=PubCompare().status['unique']['db_value']).values(
                'id', 'title_hashes', 'text_hashes')
        result = []
        for pub in pub_unique:
            result.append({
                'id': pub['id'],
                'title_hashes': pub['title_hashes'],
                'text_hashes': pub['text_hashes']
            })
        return result

    def start(self):
        # publications without a status yet
        publications = self.get_pub_without_status()
        if len(publications) == 0:
            # nothing to compare on this pass
            self.save_status(0)
            return
        pub_without_status_min_date = self.__get_pub_without_status_min_date(
            publications)
        # unique publications from the retrospective window
        unique_publications = self.__get_unique_publications(
            pub_without_status_min_date)
        # status search
        self.__search_status(publications, unique_publications)
        bulk_update(publications)
        self.save_status(len(publications))

    def __search_status(self, publications, unique_publications):
        for publication in publications:
            self.__compare_publication_with_unique_publications(
                publication, unique_publications)

    def __compare_publication_with_unique_publications(self, publication,
                                                       unique_publications):
        if len(unique_publications) > 0:
            for unique_publication in unique_publications:
                result = PubCompare().get_status(publication,
                                                 unique_publication)
                if result['status'] == 'reprint':
                    publication.status = PubCompare(
                    ).status['reprint']['db_value']
                    publication.parent_id = unique_publication['id']
                    break
                if result['status'] == 'copy':
                    publication.status = PubCompare(
                    ).status['copy']['db_value']
                    publication.parent_id = unique_publication['id']
                    break
            if publication.status is None:
                publication.status = PubCompare().status['unique']['db_value']
                self.__add_publication_in_unique_publications(
                    publication, unique_publications)
        else:
            publication.status = PubCompare().status['unique']['db_value']
            self.__add_publication_in_unique_publications(
                publication, unique_publications)

    def __add_publication_in_unique_publications(self, publication,
                                                 unique_publications):
        unique_publications.append({
            'id': publication.id,
            'title_hashes': publication.title_hashes,
            'text_hashes': publication.text_hashes,
        })

    def clear_statuses(self):
        NormalizePublication.objects.exclude(status__isnull=True).update(
            status=None)

    ########################################
    # program entry point
    def run_daemon(self):
        try:
            self.context.open()
            with self.context:
                while True:
                    Base().update_working_status(self, 'waiting')
                    can_program = Base().can_program(self)
                    if can_program:
                        Base().update_working_status(self, 'working')
                        self.start()
                        Base().update_working_status(self, 'waiting')
                        Base().update_pidfile(self)
                        time.sleep(300)
                    else:
                        time.sleep(300)
        except Exception:
            self.save_error()

    def get_pid(self):
        processes = psutil.pids()
        pid_path = os.path.join(self.pids_dir,
                                '{0}.pid'.format(self.file_name))
        with open(pid_path, 'r') as pid_file:
            pid_value = int(pid_file.readlines()[0])
        if pid_value in processes:
            return pid_value
        # stale pid file: the process is gone, so remove the file
        os.remove(pid_path)
        return None
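# PubCompare().get_status is project code not shown here; the daemon above
# only feeds it the title_hashes/text_hashes lists built by make_hashes. As
# an illustration only, one common way to score such CRC32 hash lists is
# Jaccard similarity over their sets; `jaccard`, `classify` and both
# thresholds are hypothetical, not the project's actual rules:
def jaccard(hashes_a, hashes_b):
    a, b = set(hashes_a), set(hashes_b)
    if not a or not b:
        return 0.0
    return len(a & b) / len(a | b)

def classify(pub_hashes, unique_hashes, copy_at=0.9, reprint_at=0.6):
    score = jaccard(pub_hashes, unique_hashes)
    if score >= copy_at:
        return 'copy'
    if score >= reprint_at:
        return 'reprint'
    return 'unique'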
import binascii
import os
import sys
import time
import traceback

import psutil
import pymorphy2
from django.db.models import Max
from django_bulk_update.helper import bulk_update  # assumed source of bulk_update

# Base, NormalizePublication, the vocabulary models (NOUN ... real) and the
# MakeHashesStatus/MakeHashesError models come from the project's own modules.


class Program:
    def __init__(self):
        self.publications_count = 400
        self.name = 'Publication hash creation'
        self.file_name = 'make_hashes'
        self.morth = pymorphy2.MorphAnalyzer()
        self.base_dir = os.path.dirname(
            os.path.dirname(os.path.abspath(__file__)))
        self.pids_dir = os.path.join(self.base_dir, 'daemons/pids')
        self.statuses_dir = os.path.join(self.base_dir, 'daemons/statuses')
        self.context = Base().create_daemon_context(self.file_name)
        self.vocabulary = {
            'NOUN': NOUN,
            'ADJF': ADJF,
            'ADJS': ADJS,
            'COMP': COMP,
            'VERB': VERB,
            'INFN': INFN,
            'PRTF': PRTF,
            'PRTS': PRTS,
            'GRND': GRND,
            'NUMR': NUMR,
            'ADVB': ADVB,
            'LATN': LATN,
            'NUMB': NUMB,
            'intg': intg,
            'real': real,
        }

    def get_last_status(self):
        Base().connection()
        try:
            max_date = MakeHashesStatus.objects.all().aggregate(
                Max('date'))['date__max']
            last_status = MakeHashesStatus.objects.get(date=max_date)
        except Exception:
            last_status = 'no status'
        return last_status

    def get_last_error(self):
        Base().connection()
        try:
            max_date = MakeHashesError.objects.all().aggregate(
                Max('date'))['date__max']
            last_error = MakeHashesError.objects.get(date=max_date)
        except Exception:
            last_error = 'no error'
        return last_error

    def save_error(self):
        Base().connection()
        e_type, e_value, e_traceback = sys.exc_info()
        MakeHashesError.objects.create(error=''.join(
            traceback.format_exception(e_type, e_value, e_traceback)))

    def save_status(self, count):
        Base().connection()
        if count > 0:
            status = 'Ok'
        else:
            status = 'Empty'
        MakeHashesStatus.objects.create(status=status, count=count)

    def __get_all_words(self, packet):
        vocabulary = {'None': []}
        for key in self.vocabulary:
            vocabulary[key] = []
        for line in packet:
            for key, words in line.title_words.items():
                self.__add_in_vocabulary_if_not_exists(key, words, vocabulary)
            for key, words in line.text_words.items():
                self.__add_in_vocabulary_if_not_exists(key, words, vocabulary)
        return vocabulary

    def __add_in_vocabulary_if_not_exists(self, key, words, vocabulary):
        for word in words:
            if word not in vocabulary[key]:
                vocabulary[key].append(word)

    def __replace_synonims(self, vocabulary):
        Base().connection()
        if 'None' in vocabulary:
            del vocabulary['None']
        for key, words in vocabulary.items():
            pos_words = self.vocabulary[key].objects.filter(
                crc32__in=words).values('id', 'parent_id', 'crc32')
            parent_ids = []
            for pos_word in pos_words:
                parent_ids.append(pos_word['parent_id'])
            pos_parents = self.vocabulary[key].objects.filter(
                id__in=parent_ids).values('id', 'crc32')
            result = []
            for pos_word in pos_words:
                result_line = {}
                for pos_parent in pos_parents:
                    if pos_word['parent_id'] == pos_parent['id']:
                        result_line['word_parent'] = pos_parent['crc32']
                result_line['word'] = pos_word['crc32']
                result.append(result_line)
            vocabulary[key] = result

    def __add_parent(self, result, vocabulary):
        for pos, words in vocabulary.items():
            doubled = 0
            for word in words:
                if word['word'] == result['word']:
                    if 'word_parent' in word:
                        result['word_parent'] = word['word_parent']
                    doubled = 1
                    break
            if doubled == 0:
                break

    def __link_numbers(self, poses_words, poses_hash, vocabulary):
        result = {}
        for pos, words in poses_words.items():
            result_list = []
            for word in words:
                result_line = {}
                no = self.__find_number(word, poses_hash)
                result_line['word'] = word
                result_line['no'] = no
                # attach the parent (synonym head word), if any
                self.__add_parent(result_line, vocabulary)
                result_list.append(result_line)
            result[pos] = result_list
        return result

    def __append_n_sort(self, line_words):
        result = []
        for pos, words in line_words.items():
            if pos != 'None':
                for word in words:
                    result.append(word)
        return sorted(result, key=lambda word: word['no'])

    def __make_list_with_parents(self, line_words):
        result = []
        for line_word in line_words:
            if 'word_parent' in line_word:
                result.append(line_word['word_parent'])
            else:
                result.append(line_word['word'])
        return result

    def start(self):
        Base().connection()
        packet = NormalizePublication.objects.filter(
            title_hashes={}).order_by('pubdate')[:self.publications_count]
        # collect all words from the packet
        vocabulary = self.__get_all_words(packet)
        # pull in synonyms
        self.__replace_synonims(vocabulary)
        result = []
        for line in packet:
            result_line = {}
            result_line['title_hash'] = self.__hash_list(
                line.title.split(' '))
            result_line['text_hash'] = self.__hash_list(line.text.split(' '))
            # attach position numbers to the title words
            result_line['title_words'] = self.__link_numbers(
                line.title_words, result_line['title_hash'], vocabulary)
            # merge all words
            result_line['title_words'] = self.__append_n_sort(
                result_line['title_words'])
            # build the final word list
            result_line['title_words'] = self.__make_list_with_parents(
                result_line['title_words'])
            # attach position numbers to the text words
            result_line['text_words'] = self.__link_numbers(
                line.text_words, result_line['text_hash'], vocabulary)
            # merge all words
            result_line['text_words'] = self.__append_n_sort(
                result_line['text_words'])
            # build the final word list
            result_line['text_words'] = self.__make_list_with_parents(
                result_line['text_words'])
            result.append({
                'id': line.id,
                'title_hashes': result_line['title_words'],
                'text_hashes': result_line['text_words'],
            })
        for line in packet:
            for result_line in result:
                if line.id == result_line['id']:
                    line.title_hashes = result_line['title_hashes']
                    line.text_hashes = result_line['text_hashes']
        bulk_update(packet)
        self.save_status(len(packet))

    # NOTE: appears unused and incomplete ('title_has' is never a key of
    # list_words, and nothing is returned)
    def __append_numbers(self, list_words):
        result_list = []
        for pos, words in list_words.items():
            result_line = []
            for word in words:
                no = self.__find_number(word, list_words['title_has'])

    def __find_number(self, word_to_find, words_list):
        for key, word in enumerate(words_list):
            if word_to_find == word:
                return key

    def __hash_list(self, words_list):
        crc32 = []
        for word in words_list:
            crc32.append(binascii.crc32(bytes(word, 'utf-8')))
        return crc32

    def delete_hashes(self):
        NormalizePublication.objects.exclude(title_hashes={}).update(
            title_hashes={},
            text_hashes={},
        )

    ########################################
    # program entry point
    def run_daemon(self):
        try:
            self.context.open()
            with self.context:
                while True:
                    Base().update_working_status(self, 'waiting')
                    can_program = Base().can_program(self)
                    if can_program:
                        Base().update_working_status(self, 'working')
                        self.start()
                        Base().update_working_status(self, 'waiting')
                        Base().update_pidfile(self)
                        time.sleep(300)
                    else:
                        time.sleep(300)
        except Exception:
            self.save_error()

    def get_pid(self):
        processes = psutil.pids()
        pid_path = os.path.join(self.pids_dir,
                                '{0}.pid'.format(self.file_name))
        with open(pid_path, 'r') as pid_file:
            pid_value = int(pid_file.readlines()[0])
        if pid_value in processes:
            return pid_value
        # stale pid file: the process is gone, so remove the file
        os.remove(pid_path)
        return None
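# How __hash_list and __find_number cooperate in this daemon: the title and
# text are hashed word-by-word with CRC32, and __link_numbers then maps each
# vocabulary word back to its position in that hash list so word order
# survives synonym replacement. A self-contained illustration (stdlib only;
# the sample sentence is arbitrary):
import binascii

def hash_list(words):
    return [binascii.crc32(w.encode('utf-8')) for w in words]

title_words = 'курс доллара вырос'.split()
title_hashes = hash_list(title_words)
# the position of 'доллара' recovered from its hash, as __find_number does
assert title_hashes.index(hash_list(['доллара'])[0]) == 1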