def parse_urls(self, html):
    """
    Produces a list of URLs present in the given html.

    :type html: str
    :rtype: list
    """
    soup = BeautifulSoup(html, "html.parser")
    urls = []
    # (presumably) only in the main page
    for element in soup.findAll("h2", {"class": "section-heading"}):
        if element.a:
            url = element.a.get("href")
            if url not in self.visited_urls:
                urls.append(Utility.clean_url(url))
    # in the main page, appearing as relevant articles
    for element in soup.findAll("a", {"class": "story-link"}):
        url = element.get("href")
        if url not in self.visited_urls:
            urls.append(Utility.clean_url(url))
    return urls
def load_data():
    """Loads, cleans and transforms the raw text data and returns the
    tokenized source and target phrases."""
    data_loader = dl.DialogLoaderTransformer(
        data_directory=dl.DATA_DIRECTORY,
        delimiter=dl.DELIMITER,
        movie_titles_headers=dl.MOVIES_TITLE_HEADERS,
        movie_lines_headers=dl.MOVIE_LINES_HEADERS,
        movie_conversation_headers=dl.MOVE_CONVERSATION_SEQUENCE_HEADERS)
    # loading and cleaning
    source_texts, target_texts = data_loader.get_training_data(genre=GENRE,
                                                               shuffle=True)
    tokenizer.fit_on_text(source_texts + target_texts,
                          min_keep_frequency=MIN_TOKEN_FREQ)
    # converting texts to numbers
    source_sequences = tokenizer.convert_text_to_number(source_texts)
    target_sequences = tokenizer.convert_text_to_number(target_texts)
    source_sequences, target_sequences = tokenizer.filter(
        source_numbers=source_sequences,
        target_numbers=target_sequences,
        max_token_size=MAX_TOKEN_LENGTH,
        remove_unknown=True)
    # converting numbers to tensors
    source_sequences = Utility.tensorize(source_sequences, dtype=torch.long,
                                         device=DEVICE)
    target_sequences = Utility.tensorize(target_sequences, dtype=torch.long,
                                         device=DEVICE)
    return source_sequences, target_sequences
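# Utility.tensorize is not shown in this section. A minimal sketch of what
# such a helper might do: pad the variable-length token sequences to a
# common width, then wrap them in a tensor. The name, signature, and pad
# value 0 are assumptions for illustration, not the original implementation:
import torch

def tensorize(sequences, dtype, device, pad_value=0):
    width = max(len(seq) for seq in sequences)
    padded = [list(seq) + [pad_value] * (width - len(seq))
              for seq in sequences]
    return torch.tensor(padded, dtype=dtype, device=device)

# e.g. tensorize([[1, 2, 3], [4]], torch.long, "cpu") has shape (2, 3)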
def __init__(self):
    self.ignored = 0
    self.init_url = settings.init_url
    self.pattern = re.compile(settings.nyc_regex)
    self.remaining = settings.max_pages
    self.visited_urls = set()
    Utility.reset_cache(settings.cache_directory)
def setUp(self):
    random.seed(0)
    self.id1 = random.randint(1, 10000)
    self.id2 = random.randint(1, 10000)
    self.name1 = str(uuid.uuid4())
    self.name2 = str(uuid.uuid4())
    self.catalog = Catalog()
    self.catalog.add_table(SkeletonFile(self.id1, Utility.get_tupledesc(2)),
                           self.name1)
    self.catalog.add_table(SkeletonFile(self.id2, Utility.get_tupledesc(2)),
                           self.name2)
def __init__(self):
    self.nyc_regex = settings.nyc_regex
    try:
        self.files = [file for file in os.listdir(settings.cache_directory)]
    except FileNotFoundError:
        print("Nothing to scrape")
        self.files = []
    Utility.reset_cache(settings.output_directory)
def __init__(self):
    self.utility = Utility()
    self.pincode = PinCode()
    self.phone_number_lookup = PhoneNumberLookup()
    self.phone_number = PhoneNumber(self.phone_number_lookup)
    self.output_dir = "output_dir"
    self.ms_office = MsOffice()
    self.state_mapper = StateMapper()
    self.district_mapper = DistrictMapper()
def __init__(self):
    self.ignored = 0
    self.max_pages = settings.max_pages
    self.pattern = re.compile(settings.nyc_regex)
    self.url_queue = deque()
    self.url_queue.append(settings.init_url)
    self.visited_pages = 0
    self.visited_urls = set()
    Utility.reset_cache(settings.cache_directory)
def test_combine(self):
    td1 = Utility.get_tupledesc(1, "td1")
    td2 = Utility.get_tupledesc(2, "td2")
    td3 = TupleDesc.merge(td1, td2)
    self.assertEqual(3, td3.num_fields())
    self.assertEqual(3 * IntType.get_len(), td3.get_size())
    for i in range(3):
        self.assertEqual(IntType, td3.get_field_type(i))
    self.assertTrue(self.combined_string_arrays(td1, td2, td3))
def test_duplicate_ids(self):
    new_name = str(uuid.uuid4())
    f = SkeletonFile(self.id2, Utility.get_tupledesc(2))
    self.catalog.add_table(f, new_name)
    self.assertEqual(new_name, self.catalog.get_table_name(self.id2))
    self.assertEqual(f, self.catalog.get_database_file(self.id2))
def __init__(self, phone_lookup):
    self.utility = Utility()
    phone_number_prefixes = [
        " contact no ", " mobile, no ", " mobail no ", " mobile no ",
        " mobal nbr ", "mobail", "mobail no", " phone no ", " mobil no ",
        " cell no ", " cell ", " noumber ", " contact ", " mobile/",
        " mob no*", " wtsp ", " mobile ", " mb nbr ", " mob no ", " m no/*",
        " phone ", ",ph no ", " po no ", " mobil ", " ph no ", " m no_*",
        " m no/ ", " mob/*", " c no ", " phon ", " m no ", "phone ", " mob,",
        ",no *", " mob ", " mb ", " mob*", " no *", ",mo *", " pn ", " po ",
        " ph ", " nm ", " mo ", " m *", " number ", " nub ", " mob nub-",
        "no,", "phn num,", "phn num"
    ]
    # longest prefixes first, so longer variants match before their substrings
    self.phone_number_prefixes = self.utility.reverse_list(
        sorted(list(set(phone_number_prefixes)), key=len))
    self.phone_lookup = phone_lookup
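# Sorting by length and then reversing yields a longest-prefix-first list,
# so e.g. " mobile no " is stripped before the shorter " mob " can fire
# inside it. A minimal sketch of what Utility.reverse_list presumably does
# (the real helper is not shown in this section):
def reverse_list(items):
    return list(reversed(items))

prefixes = reverse_list(sorted({" mob ", " mobile no ", " ph "}, key=len))
assert prefixes[0] == " mobile no "  # longest prefix is tried first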
def generate_n_best_move(self, current_state, possible_moves, n=5):
    """Generate the n best moves for the Optimized Hill Climbing Algorithm

    Args:
        current_state (State): current state
        possible_moves (list(dict(from, to))): list of possible moves for the current player
        n (int, optional): number of generated moves. Defaults to 5.

    Returns:
        list(dict(from, to)): list of dictionaries with 'from' and 'to'
    """
    arr_tup = []
    idx_moves = 0
    temp_state = current_state.deepcopy()
    for possible_move in possible_moves:
        idx_to = 0
        for possible_to in possible_move['to']:
            # try the move, score the resulting state, then undo it
            temp_state.board.move_pawn(possible_move['from'], possible_to)
            arr_tup.append(
                (Utility.utility_function(temp_state), [idx_moves, idx_to]))
            temp_state.board.move_pawn(possible_to, possible_move['from'])
            idx_to += 1
        idx_moves += 1
    n_best = sorted(
        arr_tup,
        key=lambda x: x[0],
        reverse=not (current_state.currentPlayer == current_state.player_2))[:n]
    result = [{
        'from': possible_moves[loc[0]]['from'],
        'to': [possible_moves[loc[0]]['to'][loc[1]]]
    } for _, loc in n_best]
    return result
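# The core pattern in generate_n_best_move: score every (move, destination)
# pair with the utility function, sort, and keep the n best. The same idea
# in isolation, with plain numbers standing in for scored moves
# (illustrative data only):
scored = [(0.4, [0, 0]), (0.9, [1, 0]), (0.1, [1, 1]), (0.7, [2, 0])]
n_best = sorted(scored, key=lambda x: x[0], reverse=True)[:2]
assert [score for score, _ in n_best] == [0.9, 0.7]  # two best moves first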
def local_search(self, current_state, possible_moves, algorithm="SA"):
    """Local search using the Simulated Annealing Algorithm or Optimized
    Hill-Climbing

    Args:
        current_state (State): current state
        possible_moves (list(dict(from, to))): list of possible moves for the current player
        algorithm (str): "SA" for Simulated Annealing, anything else for
            Optimized Hill-Climbing

    Returns:
        (list(dict(from, to))): new possible_moves with fewer possible moves
    """
    # 1/5 of the time limit for each depth (assuming the time allocated per
    # depth is uniform, and 4/5 of the time is needed to traverse the tree)
    if algorithm == "SA":
        sa_time = time() + self.t_limit / (self.max_depth * 5)
        current_value = Utility.utility_function(current_state)
        generated_moves = []
        while True:
            curr_time = sa_time - time()
            if curr_time <= 0 or not possible_moves:
                return generated_moves
            next_move, possible_moves = self.generate_random_move(
                possible_moves)
            if next_move:
                delta_e = self.generate_delta_e(next_move, current_state,
                                                current_value)
                if delta_e > 0:
                    generated_moves.append(next_move)
                # standard SA acceptance: take a worsening move with
                # probability exp(delta_e / temperature); assumes
                # `from random import random` is available
                elif random() < exp(delta_e / curr_time):
                    generated_moves.append(next_move)
    return self.generate_n_best_move(current_state, possible_moves)
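# The acceptance rule used above follows the standard simulated-annealing
# criterion: always take improving moves, and take a worsening move with
# probability exp(delta_e / temperature). A self-contained sketch of just
# that rule (`accept_move` is an illustrative helper, not part of the
# original API):
from math import exp
from random import random

def accept_move(delta_e, temperature):
    """Return True if a move whose value changes by delta_e is accepted."""
    if delta_e > 0:        # improving moves are always accepted
        return True
    if temperature <= 0:   # no temperature/time budget left
        return False
    return random() < exp(delta_e / temperature)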
def minimax(self, state, is_max, depth=0, alpha=float("-inf"),
            beta=float("inf"), algorithm="optimized"):
    """Minimax + Local Search Algorithm for solving Halma Checker

    Parameters:
        state (State): game state
        is_max (Boolean): whether to maximize the objective value

    Returns:
        Tuple: tuple of (best_move objective value, best_move)
    """
    # Terminate
    if self.terminate(depth, state):
        return None, Utility.utility_function(state)
    # Recursive
    possible_moves = state.current_player_possible_moves()
    if self.which_player == state.currentPlayer and depth == 0:
        # If it is the bot player's turn, run local search at depth = 0
        # to keep only a few of the possible moves
        possible_moves = self.local_search(state, possible_moves, algorithm)
    # If it is not the bot, consider all moves
    return self.search(is_max, possible_moves, state, depth, alpha,
                       beta)  # minimax search
def print_df(self, filename, df):
    try:
        util = Utility()
        filewritetime = datetime.datetime.now()
        filepath = self.output_path + "\\%s.txt" % filename
        filepath = "%s.txt" % filename  # overrides the path above
        f = open(filepath, 'wb')
        writer = csv.writer(f)
        writer.writerows(df)
        f.close()
        util.printKeyValue(' write file Time diff',
                           datetime.datetime.now() - filewritetime,
                           ' ', True, True)
        return filepath
    except Exception as inst:
        print(type(inst))
        print(inst.args)
def __init__(self, validation, verbose, json_string, time) -> None:
    super().__init__()
    self.__validation = validation
    self.__verbose = verbose
    self.__json = Utility.load_json(json_string)
    self.__time = time
    self.print()
def test_split_train_test_splits_based_on_fraction(self):
    """Tests that the method properly splits the input sequence."""
    sources = [1, 2, 3, 4, 5, 6, 7, 8, 9]
    targets = [11, 22, 33, 44, 55, 66, 77, 88, 99]
    test_fraction = 0.4
    train_source, train_target, test_source, test_target = Utility.split_train_test(
        sources, targets, test_fraction=test_fraction)
    self.assertEqual(train_source, [4, 5, 6, 7, 8, 9])
    self.assertEqual(train_target, [44, 55, 66, 77, 88, 99])
    self.assertEqual(test_source, [1, 2, 3])
    self.assertEqual(test_target, [11, 22, 33])
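# Utility.split_train_test itself is not shown here. Per the assertions
# above it appears to take the first int(len(data) * test_fraction) items
# as the test split (9 items * 0.4 -> 3) and the remainder as training.
# A minimal sketch under that assumption, not the original implementation:
def split_train_test(sources, targets, test_fraction):
    cut = int(len(sources) * test_fraction)  # 9 * 0.4 -> 3
    return (sources[cut:], targets[cut:],    # training split
            sources[:cut], targets[:cut])    # test split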
def __init__(self, schema, json_string) -> None:
    super().__init__()
    self.__schema = schema
    self.__json = Utility.load_json(json_string)
    self.__message = None
    self.__is_valid = True
    # validate
    self.validate_json()
def parse_urls(self, html):
    """
    Appends new URLs present in the given html to the URL queue.

    :type html: str
    """
    soup = BeautifulSoup(html, "html.parser")
    # this is (presumably) only in the main page
    for element in soup.findAll("h2", {"class": "section-heading"}):
        if element.a:
            url = element.a.get("href")
            if url not in self.visited_urls:
                self.url_queue.append(Utility.clean_url(url))
    # in the main page, appearing as relevant articles
    for element in soup.findAll("a", {"class": "story-link"}):
        url = element.get("href")
        if url not in self.visited_urls:
            self.url_queue.append(Utility.clean_url(url))
def test_combine(self):
    td = Utility.get_tupledesc(2)
    tup = Tuple(td)
    tup.set_field(0, IntField(-1))
    tup.set_field(1, IntField(0))
    self.assertEqual(IntField(-1), tup.get_field(0))
    self.assertEqual(IntField(0), tup.get_field(1))
    tup.set_field(0, IntField(1))
    tup.set_field(1, IntField(37))
    self.assertEqual(IntField(1), tup.get_field(0))
    self.assertEqual(IntField(37), tup.get_field(1))
def generate_delta_e(self, next_move, current_state, current_value):
    """Generate deltaE for the Simulated Annealing Algorithm

    Args:
        next_move (dict(from, to)): next possible move
        current_state (State): current state
        current_value (float): current state value

    Returns:
        float: deltaE value
    """
    # apply the move, evaluate the resulting state, then undo the move
    current_state.board.move_pawn(next_move['from'], next_move['to'][0])
    next_value = Utility.utility_function(current_state)
    current_state.board.move_pawn(next_move['to'][0], next_move['from'])
    return next_value - current_value
def predict(cls, text):
    """Receives a list of raw texts and returns the predicted response
    using the Greedy Search method.

    Example: text=['Who are you?']
    """
    assert isinstance(text, list)
    tokens = tokenizer.convert_text_to_number(text)
    if cls._has_unrecognized_words(tokens):
        return ["Sorry, there is a word that I don't understand:\n"]
    else:
        response, indexes = Utility.predict(
            source_texts=text,
            model=encoder_decoder,
            tokenizer=tokenizer,
            device=DEVICE,
            max_prediction_len=MAX_PRED_LENGTH)
        return response[0]
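# "Greedy Search" decoding means emitting the argmax token at every step.
# The rule in isolation, over an illustrative (steps x vocab) logits matrix;
# the real decoding loop lives inside Utility.predict, which is not shown here:
import torch

logits = torch.tensor([[0.1, 2.0, 0.3],
                       [1.5, 0.2, 0.1]])
tokens = logits.argmax(dim=1).tolist()
assert tokens == [1, 0]  # highest-scoring vocabulary id at each step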
def minimax(self, state, is_max, depth=0, alpha=float("-inf"),
            beta=float("inf")):
    """Minimax Algorithm for solving Halma Checker

    Parameters:
        state (State): game state
        is_max (Boolean): whether to maximize the objective value

    Returns:
        Tuple: tuple of (best_move objective value, best_move)
    """
    # Terminate
    if self.terminate(depth, state):
        return None, Utility.utility_function(state)
    # Recursive
    possible_moves = state.current_player_possible_moves()
    return self.search(is_max, possible_moves, state, depth, alpha, beta)
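# The `search` helper called by both minimax variants above is not shown in
# this section. The alpha-beta pruning step it presumably performs looks
# like this; the sketch walks plain nested lists (leaves are utilities)
# rather than State objects:
def alpha_beta(node, is_max, alpha=float("-inf"), beta=float("inf")):
    if isinstance(node, (int, float)):  # leaf: utility value
        return node
    best = float("-inf") if is_max else float("inf")
    for child in node:
        value = alpha_beta(child, not is_max, alpha, beta)
        if is_max:
            best = max(best, value)
            alpha = max(alpha, best)
        else:
            best = min(best, value)
            beta = min(beta, best)
        if beta <= alpha:  # prune the remaining siblings
            break
    return best

assert alpha_beta([[3, 5], [2, 9]], is_max=True) == 3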
def scrape(file_name, html):
    """
    Scrapes the given HTML.
    """
    # "time" is included so the dateline lookup below can succeed
    strainer = SoupStrainer(["span", "h1", "p", "time"])
    soup = BeautifulSoup(html, "html.parser", parse_only=strainer)
    try:
        author = soup.find("span", {
            "class": "byline-author"
        }).getText().title()
        date = Utility.clean_date(
            soup.find("time", {
                "class": "dateline"
            }).get("datetime"))
        title = soup.find("h1", {"id": "headline"}).getText()
        # appends article bodies
        content = []
        for para in soup.findAll("p", {"class": "story-content"}):
            content.append(para.getText())
        with open(
                os.path.join(settings.output_directory,
                             file_name).replace(".html", ".json"),
                "w") as file:
            article = {
                "author": author,
                "content": content,
                "date": date,
                "title": title,
            }
            json.dump(article, file, indent=4, ensure_ascii=False)
    except AttributeError:
        pass
def __init__(self, db, const):
    self.db = db
    self.utility = Utility()
    self.CONST = const
logger.info('%s' % ('{:*^60}'.format('')))

# db config
configParser = ConfigParser.RawConfigParser()
configParser.read(r'config')
config = {
    'user': configParser.get('db-config', 'user'),
    'password': configParser.get('db-config', 'password'),
    'host': configParser.get('db-config', 'host'),
    'database': configParser.get('db-config', 'database'),
}
dbs = db.DbHelper(config)
qr = db.queries(dbs, const)
util = Utility(opts.debug)
util.setLogger(logger)

if opts.dv is None and opts.loop is True:
    dvs = qr.getDvs(opts.userId)
else:
    dvs = ((opts.seq, opts.dv),)

cnt = 0
try:
    for dv in dvs:
        cnt = cnt + 1
        seq = dv[0]
        dv = dv[1]
        params = qr.getSetup(opts.userId, seq, dv)
# 1. Access to facilities
#    * Number of public basic health centers / 1000 inhabitants
#    * Number of beds / 1000 inhabitants
#
# 1. Access to medicine
#    * Number of pharmacies / 1000 inhabitants

# %% [markdown] {"toc-hr-collapsed": false}
# ## Access to professionals

# %% [markdown]
# ### Number of doctors / 1000 inhabitants

# %%
u_ndocs = Utility(name="ndocs",
                  optimal_fit=True,
                  data=data["Number of doctors"].values)
u_ndocs.assess()
u_ndocs.fit()

# %%
data.columns

# %%
u_ndocs.result.params.valuesdict()

# %% [markdown]
# ### Number of paramedical staff / 1000 inhabitants

# %%
u_nparam = Utility(name="nparam",
class PinCode:
    def __init__(self):
        self.utility = Utility()
        pin_number_prefixes = [" pin ", " pin/ ", " pin_ ", " cod ", "p/c ",
                               " pinn cod ", " code ", " cd ", " pincode ",
                               " cod ", "pin ", "pin,", " pinkod ", "pin-",
                               " pino "]
        # longest prefixes first
        self.pin_number_prefixes = self.utility.reverse_list(
            sorted(list(set(pin_number_prefixes)), key=len))

    def update_pin_number(self, address_obj):
        location_mapper = PinLocationMapper()
        hilighted_pin_list = self.get_pin_code_hilighted(address_obj)
        if hilighted_pin_list is not None and len(hilighted_pin_list) > 0:
            hilighted_pin = hilighted_pin_list[0]
            pin = hilighted_pin.replace("*", "")
            address_obj.pin = pin
            address_obj.address = address_obj.address.replace(
                hilighted_pin, "").strip()
            pin_location = location_mapper.get_address_details(pin)
            if pin_location is not None and len(pin_location) > 0:
                # print(pin_location)
                state, district, block = pin_location.split(",")
                if state is not None and district is not None \
                        and block is not None:
                    address_obj.state = state
                    address_obj.district = district
                    address_obj.block = block
            if pin is not None:
                address_obj.address = self.utility.white_space_cleaner(
                    address_obj.address) + " Pin " + pin
        return

    def pad_pin_code(self, text_input, pad_word):
        text = text_input
        space = " "
        pin_regex_0 = r"[<]\d{6}[>]"          # |<334333>|
        pin_regex_1 = r"[ ]\d{6}$"            # | 334333|
        pin_regex_2 = r"[ ]\d{6}[ ]"          # | 334333 |
        pin_regex_3 = r"[^0-9*]\d{6}[^0-9*]"  # |n334333d|
        pin_regex_4 = r"[ ]\d{4}[ ]\d{2}[ ]"  # | 3343 33 |
        pin_regex_5 = r"[^0-9]\d{3}[ ]\d{3}"  # |334 333|
        pin_regex_6 = r"[ ]\d{4}[ ]\d{2}$"    # | 3343 33|
        pin_regex_7 = r"[ ]\d{4}[ ]\d{2}[ ]"  # | 3343 33 |
        pin_regex_8 = r"[^0-9*]\d{6}[ ]"      # |n334333 |
        pin_regex_9 = r"[ ]\d{6}[^0-9*]"      # |334333n|
        pin_regex_0_matches = re.findall(pin_regex_0, text)
        pin_regex_1_matches = re.findall(pin_regex_1, text)
        pin_regex_2_matches = re.findall(pin_regex_2, text)
        pin_regex_3_matches = re.findall(pin_regex_3, text)
        pin_regex_4_matches = re.findall(pin_regex_4, text)
        pin_regex_5_matches = re.findall(pin_regex_5, text)
        pin_regex_6_matches = re.findall(pin_regex_6, text)
        pin_regex_7_matches = re.findall(pin_regex_7, text)
        pin_regex_8_matches = re.findall(pin_regex_8, text)
        pin_regex_9_matches = re.findall(pin_regex_9, text)
        if len(pin_regex_0_matches) > 0:
            for match in set(pin_regex_0_matches):
                pin = match[1:-1]
                padded_match = space + pad_word + pin + pad_word + space
                return text.replace(match, padded_match)
        if len(pin_regex_1_matches) > 0:
            # print("match 1")
            for match in set(pin_regex_1_matches):
                padded_match = (space + pad_word + match.replace(" ", "") +
                                pad_word + space)
                return text.replace(match, padded_match)
        if len(pin_regex_2_matches) > 0:
            # print("match 2")
            for match in set(pin_regex_2_matches):
                padded_match = (space + pad_word + match.replace(" ", "") +
                                pad_word + space)
                return text.replace(match, padded_match)
        if len(pin_regex_3_matches) > 0:
            # print("match 3")
            for match in set(pin_regex_3_matches):
                first_char = match[0]
                last_char = match[-1]
                pin = match[1:-1]
                padded_match = (first_char + space + pad_word + pin +
                                pad_word + space + last_char)
                text = text.replace(match, padded_match)
            return text
        if len(pin_regex_4_matches) > 0:
            # print("match 4")
            for match in set(pin_regex_4_matches):
                padded_match = (space + pad_word + match.replace(" ", "") +
                                pad_word + space)
                text = text.replace(match, padded_match)
            return text
        if len(pin_regex_5_matches) > 0:
            # print("match 5")
            for match in set(pin_regex_5_matches):
                prefix = match[0]
                padded_match = (prefix + space + pad_word +
                                match.replace(" ", "") + pad_word + space)
                text = text.replace(match, padded_match)
            return text
        if len(pin_regex_6_matches) > 0:
            # print("match 6")
            for match in set(pin_regex_6_matches):
                padded_match = (space + pad_word + match.replace(" ", "") +
                                pad_word + space)
                text = text.replace(match, padded_match)
            return text
        if len(pin_regex_7_matches) > 0:
            # print("match 7")
            for match in set(pin_regex_7_matches):
                padded_match = (space + pad_word + match.replace(" ", "") +
                                pad_word + space)
                text = text.replace(match, padded_match)
            return text
        if len(pin_regex_8_matches) > 0:
            # print("match 8")
            for match in set(pin_regex_8_matches):
                pin = match[1:-1]
                padded_match = space + pad_word + pin + pad_word + space
                text = text.replace(match, padded_match)
            return text
        if len(pin_regex_9_matches) > 0:
            # print("match 9")
            for match in set(pin_regex_9_matches):
                pin = match[1:-1]
                last_char = match[-1]
                padded_match = (space + pad_word + pin + pad_word + space +
                                last_char)
                text = text.replace(match, padded_match)
            return text
        return text_input

    def pin_number_text_remover(self, text):
        if text is not None:
            address = text.lower()
            for prefix in self.pin_number_prefixes:
                if address.find(prefix) != -1 and prefix.find(",") != -1 \
                        and prefix.find("*") != -1:
                    address = address.replace(prefix, ", *")
                if address.find(prefix) != -1 and prefix.find("*") != -1:
                    address = address.replace(prefix, " *")
                if address.find(prefix) != -1:
                    address = address.replace(prefix, " ")
            return address

    def get_pin_code_hilighted(self, address_obj):
        # print(address_obj.address)
        highlighted_pin_code_regex = r"[*]\d{6}[*]"
        pin_codes = re.findall(highlighted_pin_code_regex,
                               address_obj.address)
        # print(pin_codes)
        return list(set(pin_codes))

    def pin_code_extender(self, text):
        # Bangalore
        regex_1 = r"[bB]angalore[ -]\d{2}[ ,]"
        matches_1 = re.findall(regex_1, text)
        if len(matches_1) > 0:
            for match in matches_1:
                result = re.sub(r"[^a-zA-Z0-9]", " ", match)
                pin_matches = re.findall(r"\d{2}", result)
                for short_pin in pin_matches:
                    pin = "5600" + short_pin  # Bangalore PINs begin 5600xx
                    result = result.replace(short_pin, pin)
                text = text.replace(match, result)
        return text
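# The padding idea in pad_pin_code, in isolation: wrap a bare 6-digit PIN
# in a marker character so a later pass (get_pin_code_hilighted's
# "[*]\d{6}[*]" regex) can pick it out. Illustrative address and pad word:
import re

text = "flat 2 mg road 560034 bangalore"
match = re.search(r"[ ](\d{6})[ ]", text)  # same shape as pin_regex_2
padded = text.replace(match.group(0), " *%s* " % match.group(1))
assert re.findall(r"[*]\d{6}[*]", padded) == ["*560034*"]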
def test_get_tupledesc(self):
    td = Utility.get_tupledesc(5)
    tup = Tuple(td)
    self.assertEqual(td, tup.tuple_desc)
def get_options():
    parser = set_options()
    (options, args) = parser.parse_args()
    util = Utility()
    if options.fix is False and options.userId == '':
        print('insert fix or userId options')
        print(u'Set the fix or id option.')
        sys.exit()
    if options.fix:
        options.userId = 'system'
    if (options.dv == '' or options.dv is None) and \
            (options.loop is False or options.loop is None):
        util.printKeyValue('Insert DV code', '')
        sys.exit()
    # print options
    util.printLine()
    util.printKeyValue('debug', options.debug, open=False)
    util.printKeyValue('fix', options.fix, open=False)
    util.printKeyValue('userId', options.userId, open=False)
    util.printKeyValue('seq', options.seq, open=False)
    util.printKeyValue('dv', options.dv, open=False)
    util.printKeyValue('loop', options.loop, open=False)
    util.printKeyValue('shift', options.shift, open=False)
    util.printLine()
    return options
class ReadModule():
    """
    Constructor: ReadModule reads only the data between t0 and t1.
    """
    def __init__(self, params):
        self.t0 = params['t0']
        self.t1 = params['t1']
        self.utility = Utility()
        self.params = params

    # read an Excel file
    def read_file(self, path):
        print('')
        print(u"Started reading file %s" % path)
        workbook = xlrd.open_workbook(path)
        sheets = workbook.sheets()
        result = []
        for sh in sheets:
            one_sheet_data = self.extract_from_sheet(workbook, sh)
            result.extend(one_sheet_data)
        print(u"Finished reading file %s" % path)
        return result

    # extract data from a sheet
    def extract_from_sheet(self, book, sh, date_col=0, id_row=2, nm_row=3,
                           unit_row=4, start_col=1, start_row=5):
        series_result = []
        date_values = sh.col_values(date_col, start_rowx=start_row,
                                    end_rowx=sh.nrows)  # date values
        date_type = sh.col_types(date_col, start_rowx=start_row,
                                 end_rowx=sh.nrows)  # date types
        # io_values = sh.col_values(date_col-1)
        date_result = []
        for i in range(len(date_values)):
            if date_type[i] == 3:  # date-formatted cell
                date_tuple = xlrd.xldate_as_tuple(date_values[i],
                                                  book.datemode)
                date_result.append(datetime.date(date_tuple[0],
                                                 date_tuple[1],
                                                 date_tuple[2]))
            elif date_type[i] == 2:
                date_str = str(int(date_values[i]))
                date_result.append(
                    datetime.datetime.strptime(date_str, '%Y%m%d').date())
            # TODO: string-typed dates probably need handling too
        col_cnt = sh.row_len(id_row)
        for i in range(start_col, col_cnt):
            io_type = sh.cell(id_row - 1, i).value
            name = sh.cell(nm_row, i).value
            code = self.utility.convert_code(sh.cell(id_row, i).value)
            unit = sh.cell(unit_row, i).value
            series = Series(self.params)
            series.io_type = io_type
            series.code = code
            series.name = name
            series.group = unit
            series.value = sh.col_values(i, start_row)
            series.date = date_result
            series.data_cleansing(self.t0, self.t1)
            series.set_freq()
            # register only full-range data in the sheet list
            if series.date[0] <= self.t0 and series.date[-1] >= self.t1:
                series_result.append(series)
        return series_result

    # get the number of columns
    @staticmethod
    def column_len(sheet, index):
        col_values = sheet.col_values(index)
        col_len = len(col_values)
        for _ in takewhile(lambda x: not x, reversed(col_values)):
            col_len -= 1
        return col_len
def insert_report(self, data):
    util = Utility()
    util.printKeyValue('in output', '', open=True)
    atime = datetime.datetime.now()
    self.insert_iv(data)
    util.printKeyValue(' iv Time diff',
                       datetime.datetime.now() - atime, ' ', True, True)
    btime = datetime.datetime.now()
    self.insert_factor(data)
    util.printKeyValue(' factor Time diff',
                       datetime.datetime.now() - btime)
    ctime = datetime.datetime.now()
    self.insert_factor_weight(data)
    util.printKeyValue(' factor_weight Time diff',
                       datetime.datetime.now() - ctime)
    dtime = datetime.datetime.now()
    self.insert_factor_parent()
    util.printKeyValue(' factor parent Time diff',
                       datetime.datetime.now() - dtime)
    etime = datetime.datetime.now()
    self.insert_warning_board_idx(data)
    util.printKeyValue(' index Time diff',
                       datetime.datetime.now() - etime)
class queries():
    def __init__(self, db, const):
        self.db = db
        self.utility = Utility()
        self.CONST = const

    def getDvs(self, id):
        dataTuples = self.db.exeData(self.CONST.QR_SELECT_ALL_DV % id)
        return dataTuples

    def getSetup(self, id, seq, dvcd):
        # column indices for each variable
        ID_NM = 0
        SEQ = 1
        DV = 2
        START_DT = 3
        END_DT = 4
        LEARN_DT = 5
        NTS = 6
        FILTER = 7
        PCA = 8
        LAG = 9
        SCALING = 10
        LAG_CUT = 11
        SHIFT = 12
        DIR = 13
        THRESHOLD = 14
        dataTuples = self.db.exeData(
            self.CONST.QR_SELECT_DV_SETUP % (dvcd, id, seq))
        dbData = dataTuples[0]
        result = {}
        result['id_nm'] = dbData[ID_NM]
        result['seq'] = dbData[SEQ]
        result['nts_thres'] = dbData[NTS]
        result['t0'] = datetime.datetime.strptime(
            str(dbData[START_DT]) + '01', '%Y%m%d').date()
        result['t1'] = datetime.datetime.strptime(
            str(dbData[END_DT]) + '01', '%Y%m%d').date()
        result['t2'] = datetime.datetime.strptime(
            str(dbData[LEARN_DT]) + '01', '%Y%m%d').date()
        result['pca_thres'] = dbData[PCA]
        result['intv'] = int(dbData[LAG])
        result['lag_cut'] = int(dbData[LAG_CUT])
        result['scaling'] = dbData[SCALING]
        result['hp_filter'] = dbData[FILTER]
        result['dv'] = dbData[DV]
        result['dv_dir'] = dbData[DIR]
        result['thres_cut'] = 0.2  # fixed at .2
        result['dv_thres'] = dbData[THRESHOLD]
        result['shift'] = dbData[SHIFT]
        self.params = result
        return result

    def getDv(self, dv):
        result = []
        dataTuples = self.db.exeData(self.CONST.QR_SELECT_DV % dv)
        dbData = self.extract_from_list(dataTuples)
        result.extend(dbData)
        return result

    def getITemsFromDV(self, dv):
        items = []
        return items

    def getItems(self, id, seq, dvcd):
        items = self.db.exeData(self.CONST.QR_SELECT_ITEM % (dvcd))
        itemCdSelect = []
        itemNmSelect = []
        pathSelect = []
        dataSelect = []
        cnt = 0
        itemCdSelect.append("select '', ")
        itemNmSelect.append("select '', ")
        pathSelect.append("select 'TRD_DT', ")
        dataSelect.append("select concat(a.trd_dt,'01'), ")
        for item in items:
            itemCdSelect.append(
                "MAX(iF(a.item_cd = '" + item[0] + "', a.item_cd, null)) 'I'")
            itemNmSelect.append(
                "MAX(iF(a.item_cd = '" + item[0] +
                "', concat(a.item_nm, '_', a.unit), null)) ")
            pathSelect.append(
                "MAX(iF(a.item_cd = '" + item[0] + "', a.path, null)) ")
            dataSelect.append(
                "MAX(iF(a.item_cd = '" + item[0] + "', a.amount, null)) ")
            if cnt < len(items) - 1:
                itemCdSelect.append(', ')
                itemNmSelect.append(', ')
                pathSelect.append(', ')
                dataSelect.append(', ')
            cnt = cnt + 1
        itemCdSelect.append(
            "from iwbs_ind_var_mast a, iwbs_indust_mast b "
            "where b.dv_cd = '" + dvcd + "' and a.item_cd = b.item_cd")
        itemNmSelect.append(
            "from iwbs_ind_var_mast a, iwbs_indust_mast b "
            "where b.dv_cd = '" + dvcd + "' and a.item_cd = b.item_cd")
        pathSelect.append(
            "from iwbs_ind_var_mast a, iwbs_indust_mast b "
            "where b.dv_cd = '" + dvcd + "' and a.item_cd = b.item_cd")
        dataSelect.append(
            "from iwbs_ind_var_data a, iwbs_indust_mast b "
            "where b.dv_cd = '" + dvcd + "' and a.item_cd = b.item_cd "
            "group by a.trd_dt")
        allSelect = []
        allSelect.append(''.join(itemCdSelect))
        allSelect.append(" union all ")
        allSelect.append(''.join(itemNmSelect))
        allSelect.append(" union all ")
        allSelect.append(''.join(pathSelect))
        allSelect.append(" union all ")
        allSelect.append(''.join(dataSelect))
        result = []
        dataTuples = self.db.exeData(''.join(allSelect))
        dbData = self.extract_from_list(dataTuples)
        result.extend(dbData)
        return result

    def extract_from_list(self, data):
        series_result = []
        date_result = []
        du = DateUtility()
        date_col = 0
        id_row = 0
        nm_row = 1
        unit_row = 2
        start_col = 1
        start_row = 3
        date_values = du.getCol_values(data, date_col, start_row, len(data))
        for i in range(len(date_values)):
            date_str = str(int(date_values[i]))
            date_result.append(
                datetime.datetime.strptime(date_str, '%Y%m%d').date())
        col_cnt = len(data[id_row])
        io_type = 'I'
        for i in range(start_col, col_cnt):
            name = data[nm_row][i]
            code = self.utility.convert_code(data[id_row][i])
            unit = data[unit_row][i]
            series = Series(self.params)
            series.io_type = io_type
            series.code = code
            series.name = name
            series.group = unit
            series.value = du.getCol_values(data, i, start_row, len(data))
            series.date = date_result
            series.data_cleansing(self.params['t0'], self.params['t1'])
            series.set_freq()
            if (len(series.date) > 0 and series.date[0] <= self.params['t0']
                    and series.date[-1] >= self.params['t1']):
                series_result.append(series)
        return series_result
def start(self):
    util = Utility(self.options.debug)
    util.setLogger(self.logger)
    iv_total = []
    # fetch the independent variables from the DB
    atime = datetime.datetime.now()
    items = self.qr.getItems(
        self.options.userId, self.params['seq'], self.params['dv']
    )  # user settings / fetch items
    iv_total.extend(items)
    # fetch the dependent variable from the DB
    dv = self.qr.getDv(self.params['dv'])
    # shrink the data set for debugging
    # if options.debug:
    #     iv_total = iv_total[:12]
    #     print("length of iv_total is %s" % len(iv_total))
    util.printKeyValue(
        ' GetItems Time diff', datetime.datetime.now() - atime)
    du = DateUtility()
    interpolated_time = datetime.datetime.now()
    # build the monthly date list between t0 and t1
    month_list_str, month_list_months = du.get_montly_span(
        self.params['t0'], self.params['t1'])
    # out of sample months
    month_list_str_out, month_list_months_out = du.get_montly_span(
        self.params['t0'], self.params['t2'])
    iv_total_out = copy.deepcopy(iv_total)
    iv_info_dict = {}
    iv_total_out_time = datetime.datetime.now()
    for iv in iv_total:
        iv.set_monthly_data()  # keep only the latest data within each month
        # interpolate
        iv.set_interpolated_data(month_list_months, month_list_str)
        iv_info_dict[iv.code] = {}
        iv_info_dict[iv.code]['group'] = iv.group
    util.printKeyValue(
        ' interpolated Time diff',
        datetime.datetime.now() - interpolated_time)
    for iv in iv_total_out:
        iv.set_monthly_data()
        iv.set_interpolated_data(month_list_months_out, month_list_str_out)
    # --------------------------------------------------
    util.printKeyValue(
        ' iv_total_out Time diff',
        datetime.datetime.now() - iv_total_out_time)
    dv[0].set_monthly_data()
    dv[0].set_interpolated_data(month_list_months, month_list_str)
    dv_out = copy.deepcopy(dv)
    dv_out[0].set_monthly_data()
    dv_out[0].set_interpolated_data(month_list_months_out,
                                    month_list_str_out)
    # latest-per-month selection and interpolation done.
    df_iv_time = datetime.datetime.now()
    df_iv = read.convert_series_list_to_dataframe(iv_total)
    # out of sample months
    df_iv_out = read.convert_series_list_to_dataframe(
        iv_total_out)  # --------------------------------------------------
    util.printKeyValue(
        ' df_iv, df_iv_out Time diff',
        datetime.datetime.now() - df_iv_time)
    # run preprocessing
    pp = PreProcessing()
    df_time = datetime.datetime.now()
    # difference after the ADF test
    df_iv, df_iv_out = pp.get_adf_test_after_df(df_iv, df_iv_out,
                                                iv_info_dict)
    util.printKeyValue(
        ' adf_test Time diff', datetime.datetime.now() - df_time)
    filter_time = datetime.datetime.now()
    # HP filter
    df_iv = pp.get_hp_filter(df_iv, self.params['hp_filter'])
    util.printKeyValue(
        ' df_iv_filter Time diff', datetime.datetime.now() - filter_time)
    df_iv_time = datetime.datetime.now()
    # out of sample months ------------------------------------------------
    df_iv_out = pp.get_hp_filter(df_iv_out, self.params['hp_filter'])
    util.printKeyValue(
        ' df_iv_out_filter Time diff', datetime.datetime.now() - df_iv_time)
    # ---------------------------------------------------------------------
    df_dv_time = datetime.datetime.now()
    # dependent variable
    df_dv = read.convert_series_list_to_dataframe(dv)
    df_dv_out = read.convert_series_list_to_dataframe(dv_out)
    df_dv = df_dv[1:].reset_index(drop=True)  # drop first differenced row
    df_dv_out = df_dv_out[1:].reset_index(drop=True)
    if int(self.params['scaling']) == 1:
        df_iv, df_iv_out = pp.scale_iv(df_iv, df_iv_out)
    df_iv['DV'] = df_dv[df_dv.columns[2]]
    df_iv_out['DV'] = df_dv_out[df_dv_out.columns[2]]  # out of sample
    util.printKeyValue(
        ' df_dv_out Time diff', datetime.datetime.now() - df_dv_time)
    nts_time = datetime.datetime.now()
    # compute NTS
    nts_module = NtsCaldulator()
    dv_crisis_digit_list, dv_thres = \
        nts_module.cal_nts_total(
            df_iv, iv_info_dict, self.params['intv'],
            self.params['thres_cut'], self.params['dv_thres'],
            self.params['lag_cut'], self.params['dv_dir']
        )
    # loads NTS-related info into iv_info_dict; (2016.03.10) lag_cut added
    # to limit the crisis-identification window within the lead period
    # nts_module.cal_nts_by_digit(df_iv, dv_crisis_digit_list)
    util.printKeyValue(
        ' cal_nts_total Time diff', datetime.datetime.now() - nts_time)
    df_iv_digit_time = datetime.datetime.now()
    # store thresholds and digits according to NTS
    df_iv_digit = nts_module.get_iv_sh_digit(df_iv, iv_info_dict,
                                             self.params['dv_thres'],
                                             self.params['dv_dir'])
    util.printKeyValue(
        ' get_iv_sh_digit Time diff',
        datetime.datetime.now() - df_iv_digit_time)
    srt_time = datetime.datetime.now()
    srted = sorted(iv_info_dict.iteritems(), key=self.get_value,
                   reverse=False)
    filtered = [s for s in srted if s[1]['nts'] < self.params['nts_thres']]
    util.printKeyValue(
        ' sorted Time diff', datetime.datetime.now() - srt_time)
    factor_time = datetime.datetime.now()
    code_list = []
    for f in filtered:
        code_list.append(f[0])
    pca_module = PcaCalculator()
    y, wt, fracs, df_factor, df_factor_out = \
        pca_module.run_cap(
            df_iv[code_list], df_iv_out[code_list],
            self.params['pca_thres']
        )
    factor_weight = {}
    factor_weight['col_list'] = df_iv[code_list].columns.tolist()
    factor_weight['weight'] = wt
    factor_weight['fracs'] = fracs
    # for the df_factor_yyyymm output
    df_factor_series = df_factor.copy()
    df_factor_series['YYYYMM'] = df_iv['YYYYMM'].tolist()
    df_factor_series['DV'] = df_iv['DV'].tolist()
    # for the df_factor_yyyymm output
    df_factor_series_out = df_factor_out.copy()
    df_factor_series_out['YYYYMM'] = df_iv['YYYYMM'].tolist()
    df_factor_series_out['DV'] = df_iv['DV'].tolist()
    factor_info_dict = {}
    for col in df_factor.columns:
        factor_info_dict[col] = {}
    # df_factor['DV'] = df_dv_sh[df_dv_sh.columns[2]]
    nts_module.cal_nts_total(
        df_factor_series, factor_info_dict, self.params['intv'],
        self.params['thres_cut'], self.params['dv_thres'],
        self.params['lag_cut'], self.params['dv_dir']
    )  # (2016.03.10) lag_cut limits the crisis window within the lead period
    for i in range(len(df_factor.columns.tolist())):
        factor_info_dict[df_factor.columns.tolist()[i]]['weight'] = \
            factor_weight['fracs'][i]
    util.printKeyValue(
        ' factor Time diff', datetime.datetime.now() - factor_time)
    idx_time = datetime.datetime.now()
    # compute the crisis (early-warning) index
    df_warning_idx = self.cal_warning_idx(factor_info_dict,
                                          df_factor_series)
    df_warning_idx_out = \
        self.cal_warning_idx(factor_info_dict, df_factor_series_out)
    result = {}
    # result['params'] = params
    result['iv_raw'] = iv_total
    # result['iv_code'] = iv_code
    result['iv_info_dict'] = iv_info_dict
    result['df_iv'] = df_iv
    result['df_iv_digit'] = df_iv_digit
    result['factor_info_dict'] = factor_info_dict
    result['df_factor_yyyymm'] = df_factor_series
    result['df_warning_idx'] = df_warning_idx
    result['df_warning_idx_out'] = df_warning_idx_out
    result['dv_thres'] = dv_thres
    result['factor_weight'] = factor_weight
    util.printKeyValue(
        ' cal idx Time diff', datetime.datetime.now() - idx_time)
    return result