def get_target_object(raw_target):
    """Build a settings dictionary from the lines of a target description.

    Every line is cut at the first ``#`` (comment tail), stripped with
    ``util.remove_forward_and_back_spaces`` and skipped if nothing is left.
    The remaining lines are expected to look like ``key: value``.

    Handling per key:
        * ``kKeyRoot`` / ``kKeyIgnoredDir`` -- values are accumulated in a
          list without duplicates (first-seen order);
        * ``kKeyTargetExts`` -- the value is split on ``,`` into a list
          (a later line with the same key replaces the earlier one);
        * ``kKeyIndexName`` -- stored as a one-element list.

    Lines without ``:`` and lines with any other key are reported with
    ``No used`` on stdout and ignored.

    Args:
        raw_target: iterable of text lines.

    Returns:
        dict mapping the recognised keys to lists of strings.
    """
    list_without_comments = []
    for line in raw_target:
        tmp_line = util.remove_forward_and_back_spaces(line.split('#')[0])
        if tmp_line:
            list_without_comments.append(tmp_line)

    # TODO(zaqwes): clean out the comments
    target = {}

    def append_unique(key, value):
        # Accumulate values per key, keeping each distinct value once.
        values = target.setdefault(key, [])
        if value not in values:
            values.append(value)

    def process_one_line(line):
        # partition() tells a missing ':' apart from a real one; find()
        # returned -1 and silently produced a bogus key/value pair.
        raw_key, separator, raw_value = line.partition(':')
        if not separator:
            # Parenthesised form prints the same text on Python 2 and 3.
            print('No used')
            return
        key = util.remove_forward_and_back_spaces(raw_key)
        value = util.remove_forward_and_back_spaces(raw_value)

        if key == kKeyRoot:
            append_unique(key, value)
        elif key == kKeyTargetExts:
            list_ext = value.split(',')
            target[key] = list(
                util.remove_fandb_spaces_in_tuple(tuple(list_ext)))
        elif key == kKeyIndexName:
            # Also a list, although with a single element: convenient for
            # later processing.
            target[key] = [util.remove_forward_and_back_spaces(value)]
        elif key == kKeyIgnoredDir:
            append_unique(key, value)
        else:
            print('No used')

    # Explicit loop: map() is lazy on Python 3 and would process nothing.
    for line in list_without_comments:
        process_one_line(line)
    return target
def process_one_line(line):
    """Parse one ``key: value`` line and store the result in ``target``.

    ``target`` is not defined in this function; it is looked up in the
    surrounding scope and must be a dict.

    Handling per key:
        * ``kKeyRoot`` / ``kKeyIgnoredDir`` -- the value is added to the
          key's list unless it is already there;
        * ``kKeyTargetExts`` -- the value is split on ``,`` into a list;
        * ``kKeyIndexName`` -- stored as a one-element list.

    A line without ``:`` or with any other key prints ``No used`` and
    leaves ``target`` untouched.

    Args:
        line: text line, expected to be already free of comments.
    """
    # partition() tells a missing ':' apart from a real one; find()
    # returned -1 and silently produced a bogus key/value pair.
    raw_key, separator, raw_value = line.partition(':')
    if not separator:
        # Parenthesised form prints the same text on Python 2 and 3.
        print('No used')
        return
    key = util.remove_forward_and_back_spaces(raw_key)
    value = util.remove_forward_and_back_spaces(raw_value)

    if key == kKeyRoot:
        values = target.setdefault(key, [])
        if value not in values:
            values.append(value)
    elif key == kKeyTargetExts:
        list_ext = value.split(',')
        target[key] = list(
            util.remove_fandb_spaces_in_tuple(tuple(list_ext)))
    elif key == kKeyIndexName:
        # Also a list, although with a single element: convenient for
        # later processing.
        target[key] = [util.remove_forward_and_back_spaces(value)]
    elif key == kKeyIgnoredDir:
        values = target.setdefault(key, [])
        if value not in values:
            values.append(value)
    else:
        print('No used')
def _parse_target_params(str_params): rpt = [] if str_params.count('[') != str_params.count(']'): return None, 1, "\tError: [Count '[' != count ']']" if str_params.count(':') != str_params.count('[') or \ str_params.count(':') != str_params.count(']'): return None, 1, "\tError: [Format param - [anything : something]]" params = str_params.replace('[', '') params = params.split(']') params_map = {} for at in params: if at: pair = remove_forward_and_back_spaces(at) key, value = remove_fandb_spaces_in_tuple(tuple(pair.split(':'))) # Запрещаем второе значени, соотв. ключу if key not in params_map: params_map[key] = value else: return None, 1, "\tError: only one params key permitted" params_json = json.dumps(params_map) return params_json, 0, None
def parser_target_for_spider(target_fname):
    """Parse a spider target file and yield one result per job line.

    The file is read through ``dal.efile2list``. Everything after ``#`` on
    a line is a comment; lines that are empty after stripping are ignored.
    A line that starts with ``[`` and ends with ``]`` names a node; every
    other line is a URL, optionally followed by ``[key: value]`` groups.

    Args:
        target_fname: name of the target file.

    Yields:
        Tuples ``(job, code, report)``:
        * ``((node, url, index, params_json), 0, report)`` for every URL
          line; ``index`` counts URL lines within the current node starting
          at 1, ``params_json`` is ``'{}'`` when the line has no parameters;
        * ``(None, 1, report)`` when the file could not be read (the
          generator then stops);
        * ``(None, 2, report)`` when the file has no content or does not
          start with a node name (the generator then stops), and also when
          a node name is repeated (parsing continues).

    NOTE(review): when the parameters of a URL line fail to parse, the item
    is still yielded with code 0, ``params_json`` set to None and the error
    text in ``report`` -- confirm that callers rely on ``report`` here.

    TODO: make the to-text converters customizable.
    """
    sets = dal.get_utf8_template()
    sets['name'] = target_fname
    list_lines, err = dal.efile2list(sets)
    if err[0]:
        yield None, 1, err[1]
        return

    # Cut the comment tail off every line, then drop the empty lines.
    list_without_comments = [
        remove_forward_and_back_spaces(line.split('#')[0])
        for line in list_lines]
    result_job_list = [line for line in list_without_comments if line]

    # The first informative line must be a node name. The emptiness check
    # comes first: indexing an empty list raised IndexError before.
    if not result_job_list or not is_node(result_job_list[0]):
        rpt = 'target_fname: '+target_fname+ \
            '. Неверный формат файла - первое имя узла должно быть до адресов.'+ \
            'Либо файл с заданиями пуст.'
        yield None, 2, rpt
        return

    current_node = get_node_name(result_job_list[0])
    i = 0
    nodes = []
    for at in result_job_list:
        if is_node(at):
            current_node = get_node_name(at)
            if current_node not in nodes:
                nodes.append(current_node)
            else:
                yield None, 2, 'Name node: ['+current_node+ \
                    ']\n'+"\tError: Node name need be unic."
            # URL numbering restarts with every node line.
            i = 0
            continue

        i += 1
        # Separate the URL from its optional parameter groups.
        pos_first_settings_item = at.find('[')
        if pos_first_settings_item != -1:
            url = remove_forward_and_back_spaces(
                at[:pos_first_settings_item])
            params, code_err, rpt = _parse_target_params(
                at[pos_first_settings_item:])
            if code_err != 0 and rpt:
                rpt = 'Name node: ['+current_node+']\nUrl: ['+url+']\n'+rpt
            yield (current_node, url, i, params), 0, rpt
        else:
            url = remove_forward_and_back_spaces(at)
            yield (current_node, url, i, '{}'), 0, None
def get_one_node(line):
    """Return the node name found before the first ``*`` of ``line``.

    Square brackets are removed from that part and the result is stripped
    with ``util.remove_forward_and_back_spaces``.
    """
    head = line.split('*')[0]
    for bracket in ('[', ']'):
        head = head.replace(bracket, '')
    return util.remove_forward_and_back_spaces(head)
def get_node_name(src_node_name):
    """Return ``src_node_name`` without square brackets, stripped with
    ``remove_forward_and_back_spaces``."""
    without_opening = src_node_name.replace('[', '')
    bare_name = without_opening.replace(']', '')
    return remove_forward_and_back_spaces(bare_name)
def is_node(line):
    """Tell whether ``line`` is a node header.

    The line is stripped with ``remove_forward_and_back_spaces`` first; it
    is a node header when the result starts with ``[`` and ends with ``]``.

    Returns:
        True for a node header, False otherwise -- including a line that is
        empty after stripping, which used to raise IndexError.
    """
    line = remove_forward_and_back_spaces(line)
    if not line:
        return False
    return line.startswith('[') and line.endswith(']')
def get_url(line):
    """Return the part of ``line`` between its first and second ``*``
    (or up to the end of the line), stripped with
    ``util.remove_forward_and_back_spaces``.

    Raises:
        IndexError: if ``line`` contains no ``*``.
    """
    pieces = line.split('*')
    url_part = pieces[1]
    return util.remove_forward_and_back_spaces(url_part)