def add_dicts_value(dict1, dict2):
    """Add the values of ``dict2`` into ``dict1`` for the same keys, in place.

    Both dicts are expected to have the same key set (e.g. emotion-score
    dicts); on a length mismatch an error is logged and the process exits.

    Args:
        dict1: accumulator dict, mutated in place.
        dict2: dict whose values are added onto dict1's.

    Returns:
        dict1, after accumulation.
    """
    # BUG FIX: the original placed this docstring text as a dead string
    # statement in the middle of the function body.
    if len(dict1) != len(dict2):
        logger1.error("Emotion list do not have the same length!")
        sys.exit()
    # BUG FIX (idiom): use the unpacked value instead of a second lookup.
    for key, value in dict2.items():
        dict1[key] += value
    return dict1
def save_raw_data(self, save_folder, is_f=True, is_prediction=False):
    """Write each sample's feature vector to '<sample>.csv' in save_folder.

    Each output CSV holds one 'attributor,value' pair per line, built from
    the technical features in self.a_share_samples_t_dict and (optionally)
    the fundamental features in self.a_share_samples_f_dict.

    Args:
        save_folder: destination directory for the per-sample CSV files.
        is_f: when True, fundamental features are appended; samples with
            no fundamental data are skipped with an error log.
        is_prediction: currently unused here (kept for interface parity
            with the read_* methods).
    """
    save_count = 0
    for sample, t_feature_array in self.a_share_samples_t_dict.items():
        feature_array_list = []
        # (0.) add technical features
        feature_array_list.append(t_feature_array)
        # (1.) add fundamental features
        if is_f:
            is_sample_exist = self.a_share_samples_f_dict.get(sample)
            if is_sample_exist is None:
                logger1.error(
                    'sample {} does not have any fundamental data'.format(
                        sample))
                continue
            f_feature_array = self.a_share_samples_f_dict[sample]
            feature_array_list.append(f_feature_array)
        # concatenate all features
        feature_array_final = np.array([])
        for feature_array in feature_array_list:
            feature_array_final = self.integrate_tech_fundamental_feature(
                feature_array_final, feature_array)
        # convert every feature to float
        feature_array_final = feature_array_final.astype(float)
        # BUG FIX: this definition appeared commented out in the original,
        # which would raise NameError on the first use below — restored.
        feature_list_final = list(feature_array_final)
        attribitors = self.t_attributors + self.f_attributors
        # sanity check: one attribute name per feature value
        if len(attribitors) != len(feature_list_final):
            logger1.error(
                'sample: {}, feature_list_final and attribitors are not the same length! {}, {}'
                .format(sample, len(attribitors), len(feature_list_final)))
            continue
        save_zip = zip(attribitors, feature_list_final)
        # save file
        save_name = sample + '.csv'
        save_path = os.path.join(save_folder, save_name)
        with open(save_path, 'w', encoding='utf-8') as f:
            for attribitor, feature_value in save_zip:
                f.write(str(attribitor) + ',' + str(feature_value) + '\n')
        save_count += 1
    print("Save {} samples to {} succesfully!".format(
        save_count, save_folder))
def get_emotion_dict(self, file_path):
    """POST an image to the Microsoft Emotion API and return the first
    face's recognition result.

    Args:
        file_path: path to the image file sent as a binary payload.

    Returns:
        dict parsed from the first element of the API's JSON response, or
        None when no face is detected or the request is rejected.
    """
    # SECURITY NOTE(review): subscription keys are hard-coded in source;
    # they should live in configuration / environment variables.
    # 0f85da3c79394b2887291025758afa94, e1fc0e40c7464cebbc63317c3b0f5b26
    headers = {
        'Content-type': 'application/octet-stream',
        'Ocp-Apim-Subscription-Key': 'e1fc0e40c7464cebbc63317c3b0f5b26',
    }
    # All optional query parameters are disabled, so this encodes to ''.
    params = urllib.parse.urlencode({})
    # BUG FIX: read the image through a context manager — the original
    # open(...).read() leaked the file handle.
    with open(file_path, "rb") as image_file:
        payload = image_file.read()
    conn = http.client.HTTPSConnection(
        'westus.api.cognitive.microsoft.com')
    try:
        conn.request("POST", "/emotion/v1.0/recognize?%s" % params,
                     payload, headers)
        print("send request")
        response = conn.getresponse()
        data = response.read().decode('utf-8')
        print("data: ", data)
        # the API returns a JSON array, one entry per detected face
        json_obj = json.loads(data)[0]
    except IndexError:
        # empty array: no faces found in the image
        logger1.info("{} does not contain any faces".format(file_path))
        return None
    except KeyError:
        logger1.info(
            "{} key Error! Maybe too big or too small".format(file_path))
        return None
    except Exception:
        # BUG FIX: the original passed sys.exc_info()[0] as a lazy logging
        # argument with no %s placeholder (it was never rendered), and had
        # an unreachable `return None` after `raise`.
        logger1.error("Unexpected error: %s", sys.exc_info()[0])
        raise
    finally:
        # BUG FIX: always close the connection; the original only closed
        # it on the success path.
        conn.close()
    return json_obj
def get_text_emotion_dict(self):
    """Score every text file in self.text_folder_path for sentiment and
    store the result per date.

    File names are expected to embed a 'YYYY-MM-DD#' date. Each file's
    English words are joined, truncated to 80000 chars, and posted to the
    Mashape text-processing sentiment API. The returned probability dict
    (plus a derived 'emotion_value' = pos - neg) is stored in
    self.date_text_emotion_dict[date]; the raw text is appended to
    self.raw_data_dict[date].
    """
    texts_name_list = os.listdir(self.text_folder_path)
    texts_path_list = [
        os.path.join(self.text_folder_path, x) for x in texts_name_list
    ]
    for text_file in texts_path_list:
        print("processing {}....".format(text_file))
        with open(text_file, 'r', encoding='utf-8') as f:
            # date is parsed out of the file name, e.g. '2017-02-01#...'
            date_str = re.findall(r'([0-9]+-[0-9]+-[0-9]+)#', f.name)[0]
            date_of_text_temp = time.strptime(date_str, '%Y-%m-%d')
            date_of_text = datetime.datetime(*date_of_text_temp[:3])
            date_object = datetime.date(year=date_of_text.year,
                                        month=date_of_text.month,
                                        day=date_of_text.day)
            text_content_list = f.readlines()
            text_content = '.'.join(text_content_list)
            # keep only alphabetic words; cap payload at 80000 chars
            text_content_re_list = re.findall(r'[A-Za-z]+', text_content)
            text_content = '.'.join(text_content_re_list)[0:80000]
            request_dict = {}
            request_dict['language'] = "english"
            request_dict['text'] = text_content
            # SECURITY NOTE(review): the Mashape API key is hard-coded; it
            # should be moved to configuration.
            response = requests.post(
                "https://japerk-text-processing.p.mashape.com/sentiment/",
                headers={
                    "X-Mashape-Key":
                    "muMV4DdXyqmsh6hEQIryzApEFo4bp14Nb8ojsnQZdTCaEAUMxo",
                    "Content-Type": "application/x-www-form-urlencoded",
                    "Accept": "application/json"
                },
                data=request_dict)
            print("status_code: ", response.status_code)
            response_dict = response.json()['probability']
            # scalar sentiment: positive minus negative probability
            response_dict['emotion_value'] = response_dict[
                'pos'] - response_dict['neg']
            # NOTE(review): a truthy existing entry means this date was
            # already scored — the old entry is overwritten regardless.
            if self.date_text_emotion_dict[date_object]:
                logger1.error("{} has mutilple copies".format(date_object))
            self.date_text_emotion_dict[date_object] = response_dict
            # add raw txt data to dict and create new file
            self.raw_data_dict[date_object].append(text_content)
def input_list_length_check(p1_chbits, p2_chbits):
    """Abort the process if the two crossover parents differ in length.

    Args:
        p1_chbits: first parent's chromosome bit list.
        p2_chbits: second parent's chromosome bit list.

    Returns:
        None. Exits the process via sys.exit(0) on a length mismatch.
    """
    # check the length of both parents
    if len(p1_chbits) != len(p2_chbits):
        logger1.error(
            "The length of the input parents for the crossover is not equal!!"
        )
        # BUG FIX: the original passed the lists as lazy logging arguments
        # with no %s placeholder, so they were never rendered in the log.
        logger1.error("Error parent list: %s", p1_chbits)
        logger1.error("Error parent list: %s", p2_chbits)
        sys.exit(0)
def get_photo_emotion_dict(self):
    """Aggregate per-photo emotion scores into self.date_photo_emotion_dict.

    Every photo in self.face_folder_path is dated via its EXIF
    'DateTimeDigitized' tag and scored through self.get_emotion_dict();
    scores for the same date are summed, then averaged over the photo
    count ('dict_num') at the end.
    """

    def add_dicts_value(dict1, dict2):
        """Add the values of the 2 dicts with the same key, in place."""
        # both score dicts must describe the same emotion set
        if len(dict1) != len(dict2):
            logger1.error("Emotion list do not have the same length!")
            sys.exit()
        for key, value in dict2.items():
            dict1[key] += value
        return dict1

    photo_folder_list = os.listdir(self.face_folder_path)
    photo_folder_list = [
        os.path.join(self.face_folder_path, x) for x in photo_folder_list
    ]
    for photo_file_path in photo_folder_list:
        # BUG FIX: close the photo after reading EXIF tags — the original
        # leaked one open file handle per photo.
        with open(photo_file_path, 'rb') as f:
            tags = exifread.process_file(f)
        try:
            date_of_photo = tags['EXIF DateTimeDigitized']
        except KeyError:
            logger1.error(
                "photo in {} has no meta data of digitized time!".format(
                    photo_file_path))
            continue
        # EXIF value looks like '2017:02:01 ...'; keep only the date part
        date_of_photo = str(date_of_photo)
        date_of_photo = re.findall(r'([0-9]+:[0-9]+:[0-9]+)',
                                   date_of_photo)[0]
        date_of_photo_temp = time.strptime(date_of_photo, '%Y:%m:%d')
        date_of_photo = datetime.datetime(*date_of_photo_temp[:3])
        date_of_photo = datetime.date(year=date_of_photo.year,
                                      month=date_of_photo.month,
                                      day=date_of_photo.day)
        print("date_of_photo", date_of_photo, type(date_of_photo))
        photo_emotion_dict = self.get_emotion_dict(photo_file_path)
        if photo_emotion_dict:
            photo_emotion_dict = photo_emotion_dict['scores']
        else:
            # no face detected / request rejected — skip this photo
            continue
        # sum scores per date; first photo of a date seeds the dict
        if self.date_photo_emotion_dict[date_of_photo]['dict']:
            self.date_photo_emotion_dict[date_of_photo][
                'dict'] = add_dicts_value(
                    self.date_photo_emotion_dict[date_of_photo]['dict'],
                    photo_emotion_dict)
        else:
            self.date_photo_emotion_dict[date_of_photo][
                'dict'] = photo_emotion_dict
        self.date_photo_emotion_dict[date_of_photo]['dict_num'] += 1
    # compute the average of date_photo_emotion_dict (3 decimal places)
    for date, date_dict in self.date_photo_emotion_dict.items():
        dict_num = date_dict['dict_num']
        for emotion, emotion_value in self.date_photo_emotion_dict[date][
                'dict'].items():
            emotion_value /= dict_num
            self.date_photo_emotion_dict[date]['dict'][emotion] = float(
                "{:.3f}".format(emotion_value))
def read_tech_history_data(self, start_date, is_prediction=False):
    """Download weekly K-line data via tushare for every stock in
    self.stock_set and build technical feature vectors per (date, stock).

    Feature vectors are stored in self.a_share_samples_t_dict keyed by
    '<date>_<stock_id>'; the sorted attribute names go to
    self.t_attributors.

    Args:
        start_date: 'YYYY-MM-DD' string; history is fetched from here on.
        is_prediction: when True, trailing rows are kept and the
            'priceChange' label is set to the string "nan" (there is no
            future data to compute it from).
    """
    # clear
    self.a_share_samples_t_dict = collections.defaultdict(lambda: 0)
    #
    start_date_temp = time.strptime(start_date, '%Y-%m-%d')
    start_date_obj = datetime.datetime(*start_date_temp[:3]).date()  # NOTE(review): unused
    today_obj = datetime.datetime.today().date()  # NOTE(review): unused
    today = datetime.datetime.today().strftime("%Y-%m-%d")  # NOTE(review): unused
    # Probe one known stock to discover the column names tushare returns,
    # then add the three derived attributes computed below.
    t_attributors_set = set(
        ts.get_k_data("600883", start="2017-05-09", ktype='W').keys())
    t_attributors_set -= {'code', 'date'}
    t_attributors_set.add('priceChange')
    t_attributors_set.add('candleLength')
    t_attributors_set.add('candlePos')
    t_attributors = sorted(list(t_attributors_set))
    self.t_attributors = t_attributors
    stock_list = list(self.stock_set)[:]
    is_close_price_exist = True  # NOTE(review): unused
    for stock_id in stock_list:
        fund_dict = ts.get_k_data(stock_id, start=start_date,
                                  ktype='W').to_dict()
        # date_list: ['2017-05-05', '2017-05-12', '2017-05-19']
        try:
            # date_items: [(29, '2016-08-05'), (30, '2016-08-12'), ...]
            date_items = sorted(list(fund_dict['date'].items()),
                                key=lambda x: x[0])
        except KeyError:
            logger1.error("{} stock has no key data".format(stock_id))
            continue
        for i, (id, date_str) in enumerate(date_items):
            # `id` shadows the builtin; here it is the row index of this
            # week in the tushare frame.
            # In training mode skip the last two rows: the label below
            # needs the next and next-next week's open prices.
            if i > len(date_items) - 3 and is_prediction is False:
                print(
                    "Skip {} on {} because of reaching the end. The data of the rest date "
                    "cannot be fully presented".format(id, date_str))
                continue
            # TODO(review): rows i+1 / i+2 are assumed to be exactly the
            # next / next-next week; validating the 7/14-day gap is hard
            # because of market holidays.
            feature_list = []
            for attributor in t_attributors:
                # label: relative open-price change from next week to
                # next-next week
                if attributor == 'priceChange' and is_prediction is False:
                    nw_open = fund_dict['open'][date_items[i + 1][0]]
                    nnw_open = fund_dict['open'][date_items[i + 2][0]]
                    priceChange = "{:.5f}".format(
                        (nnw_open - nw_open) / nw_open)
                    feature_list.append(priceChange)
                elif attributor == 'priceChange' and is_prediction is True:
                    # no future data at prediction time
                    priceChange = "nan"
                    feature_list.append(priceChange)
                elif attributor == 'candleLength':
                    # |close - open| / (high - low): body vs full range
                    close_price = fund_dict['close'][id]
                    open_price = fund_dict['open'][id]
                    high_price = fund_dict['high'][id]
                    low_price = fund_dict['low'][id]
                    candle_length = "{:.5f}".format(
                        abs((close_price - open_price) /
                            (high_price - low_price)))
                    feature_list.append(candle_length)
                elif attributor == 'candlePos':
                    # position of the candle body inside the week's range
                    close_price = fund_dict['close'][id]
                    open_price = fund_dict['open'][id]
                    high_price = fund_dict['high'][id]
                    low_price = fund_dict['low'][id]
                    price = max(close_price, open_price)
                    candle_pos = "{:.5f}".format(
                        abs((high_price - price) /
                            (high_price - low_price)))
                    feature_list.append(candle_pos)
                else:
                    # raw tushare column value for this week
                    feature_list.append(fund_dict[attributor][id])
            feature_array = np.array(feature_list)
            sample_name = date_str + '_' + stock_id
            self.a_share_samples_t_dict[sample_name] = feature_array
        print("saving {} stock t features".format(stock_id))
    print("t_attributors: {}".format(t_attributors))
    print("a_share_samples_t_dict: {}".format(
        self.a_share_samples_t_dict.values()))
    print("a_share_samples_t_dict_value: {}".format(
        list(self.a_share_samples_t_dict.values())[0]))
def feature_engineering(self, input_folder, save_folder,
                        keep_stock_ids_path=None):
    """Derive week-over-week change features from raw per-sample CSVs.

    For every '<date>_<stock_id>.csv' in input_folder, locate the same
    stock's file from the previous week (7-12 days earlier), compute
    relative-change features, drop the raw-level columns, and write the
    result as '<date>_<stock_id>.txt' into save_folder.

    Args:
        input_folder: folder of raw 'feature_name,value' CSV files.
        save_folder: destination folder for the engineered files.
        keep_stock_ids_path: optional text file with one stock id per
            line; when given, only those stocks are processed.
    """
    if keep_stock_ids_path:
        keep_stock_ids_list = []
        with open(keep_stock_ids_path, 'r') as f:
            for line in f:
                keep_stock = line.strip()
                keep_stock_ids_list.append(keep_stock)
    file_name_list = os.listdir(input_folder)
    file_path_list = [
        os.path.join(input_folder, file_name)
        for file_name in file_name_list
    ]
    successful_save_count = 0
    original_data_count = len(file_name_list)
    for i, file_path in enumerate(file_path_list):
        file_name = file_name_list[i]
        stock_id = re.findall(r'_([0-9]+).csv', file_name)[0]
        # filter stock ids
        if keep_stock_ids_path:
            if stock_id not in keep_stock_ids_list:
                continue
        date = re.findall(r'([0-9]+-[0-9]+-[0-9]+)_', file_name)[0]
        date_obj_temp = time.strptime(date, '%Y-%m-%d')
        date_obj = datetime.datetime(*date_obj_temp[:3])
        # Find the previous week's file; the gap is not necessarily
        # exactly 7 days (holidays), so probe 7..12 days back.
        previous_week_date_full_path = ''
        pre_f_day_range = (7, 13)
        for days in range(pre_f_day_range[0], pre_f_day_range[1]):
            previous_friday_obj = date_obj - datetime.timedelta(days=days)
            previous_friday_str = previous_friday_obj.strftime("%Y-%m-%d")
            previous_friday_full_path = previous_friday_str + '_' + stock_id + '.csv'
            previous_friday_full_path = os.path.join(
                input_folder, previous_friday_full_path)
            # BUG FIX: the original probed with a bare open() and never
            # closed the handle, leaking one descriptor per processed file.
            try:
                with open(previous_friday_full_path, 'r', encoding='utf-8'):
                    pass
                previous_week_date_full_path = previous_friday_full_path
                break
            except FileNotFoundError:
                continue
        if not previous_week_date_full_path:
            logger1.error(
                "{} cannot find the previous week's data within 13 days".
                format(file_name))
            continue
        else:
            with open(previous_week_date_full_path, 'r',
                      encoding='utf-8') as f:
                previous_f_feature_pair_dict = {}
                for line in f:
                    line_list = line.split(',')
                    feature_name = line_list[0]
                    feature_value = float(line_list[1].strip())
                    previous_f_feature_pair_dict[
                        feature_name] = feature_value
        feature_pair_dict = {}
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                line_list = line.split(',')
                feature_name = line_list[0]
                feature_value = float(line_list[1].strip())
                feature_pair_dict[feature_name] = feature_value
        # ===========================================================
        # add week-over-week change features
        # ===========================================================
        # (1.-5.) relative change of open/close/high/low/volume versus
        # the previous week. (Consolidated from five copy-pasted stanzas;
        # the original also reused `f` here, shadowing the file-handle
        # name above — renamed for clarity.)
        for base_feature in ('open', 'close', 'high', 'low', 'volume'):
            prev_value = previous_f_feature_pair_dict[base_feature]
            cur_value = feature_pair_dict[base_feature]
            feature_pair_dict[base_feature + 'Change'] = "{:.5f}".format(
                (cur_value - prev_value) / prev_value)
        # (6.) intra-week open -> close change
        open_price = feature_pair_dict['open']
        close_price = feature_pair_dict['close']
        open_close_change = (close_price - open_price) / open_price
        feature_pair_dict['openCloseChange'] = "{:.5f}".format(
            open_close_change)
        # (7.) intra-week low -> high change
        low_price = feature_pair_dict['low']
        high_price = feature_pair_dict['high']
        low_high_change = (high_price - low_price) / low_price
        feature_pair_dict['lowHighChange'] = "{:.5f}".format(
            low_high_change)
        # **************************************************************
        # FUNDAMENTALS: pb / pe week-over-week change ({:.6f} precision)
        # **************************************************************
        FUNDAMENTAL_ATTRIBUTOR_SET = {'pb', 'pe'}
        for attritubtor in FUNDAMENTAL_ATTRIBUTOR_SET:
            pre = previous_f_feature_pair_dict[attritubtor]
            this_week = feature_pair_dict[attritubtor]
            new_attributor_name = attritubtor + 'Change'
            try:
                feature_pair_dict[new_attributor_name] = "{:.6f}".format(
                    (this_week - pre) / pre)
            except ZeroDivisionError:
                # a zero previous value would blow up; fall back to 1.0
                set_value = "1.0"
                feature_pair_dict[new_attributor_name] = set_value
                logger1.error(
                    "New attributor {} has ZeroDivisionError! attritubtor: {}, temporal set value: {}"
                    .format(os.path.basename(previous_week_date_full_path),
                            new_attributor_name, set_value))
        # ===========================================================
        # delete raw-level features that should not reach the model
        # ===========================================================
        delete_features_set = {
            'close', 'high', 'low', 'open', 'timeToMarket', 'liquidAssets',
            'fixedAssets', 'reserved', 'reservedPerShare', 'esp', 'bvps',
            'pb', 'undp', 'perundp', 'holders', 'totals', 'totalAssets',
            'outstanding'
        }
        for feature_name in delete_features_set:
            feature_pair_dict.pop(feature_name)
        # write the engineered file as 'name1,value1,name2,value2,...'
        file_name = file_name.replace('csv', 'txt')
        save_file_path = os.path.join(save_folder, file_name)
        with open(save_file_path, 'w', encoding='utf-8') as f:
            feature_pair_list = []
            feature_pair_tuple_list = sorted(list(
                feature_pair_dict.items()),
                                             key=lambda x: x[0])
            for feature_pair in feature_pair_tuple_list:
                feature_pair_list.append(feature_pair[0])
                feature_pair_list.append(feature_pair[1])
            feature_pair_list = [str(x) for x in feature_pair_list]
            feature_pair_str = ','.join(feature_pair_list)
            f.write(feature_pair_str)
        successful_save_count += 1
    print(
        "Succesfully engineered {} raw data! original count: {}, delete {} files"
        .format(successful_save_count, original_data_count,
                original_data_count - successful_save_count))
def read_fundamental_data(self, start_date, is_filter_new_stock=False):
    """Download per-Friday fundamental data via tushare and build
    fundamental feature vectors per (date, stock).

    Feature vectors are stored in self.a_share_samples_f_dict keyed by
    '<date>_<stock_id>'; the sorted attribute names (minus the filtered
    text columns) go to self.f_attributors.

    Args:
        start_date: 'YYYY-MM-DD' string; Fridays from here to today are
            fetched.
        is_filter_new_stock: when True, stocks listed within the last 28
            days of the sampled Friday are dropped from that date.
    """
    # clear
    self.a_share_samples_f_dict = collections.defaultdict(lambda: 0)
    #
    start_date_temp = time.strptime(start_date, '%Y-%m-%d')
    start_date_obj = datetime.datetime(*start_date_temp[:3]).date()
    today_obj = datetime.datetime.today().date()
    today = datetime.datetime.today().strftime("%Y-%m-%d")  # NOTE(review): unused
    # manually typed column set, matching what ts.get_stock_basics returns
    f_attributors_set = {
        'holders', 'undp', 'gpr', 'pb', 'industry', 'bvps', 'timeToMarket',
        'rev', 'perundp', 'fixedAssets', 'name', 'reservedPerShare',
        'totals', 'outstanding', 'liquidAssets', 'profit', 'pe',
        'reserved', 'npr', 'area', 'totalAssets', 'esp'
    }
    #
    # change the date if time out
    #f_attributors_set = set(ts.get_stock_basics(date = "2017-05-26").to_dict().keys())
    #
    # non-numeric text columns are excluded from the feature set
    filter_set = {'name', 'industry', 'area'}
    f_attributors_set = f_attributors_set - filter_set
    f_attributors = sorted(list(f_attributors_set))
    self.f_attributors = f_attributors
    for single_date in daterange(start_date_obj, today_obj):
        temp_stock_feature_dict = collections.defaultdict(lambda: [])
        temp_stock_feature_dict_key_pop_set = set(
        )  # for filtering the new stocks
        # if it is not friday, skip!
        if single_date.weekday() != 4:
            continue
        date_str = single_date.strftime("%Y-%m-%d")
        try:
            print("date_str: ", date_str)
            ts_temp = ts.get_stock_basics(date=date_str)
            if ts_temp is None:
                logger1.error("{} not found any data!".format(date_str))
                continue
            fund_dict = ts_temp.to_dict()
        except urllib.error.HTTPError:
            # tushare raises for dates with no published data
            logger1.error("{} not found any data!".format(date_str))
            continue
        for key, stock_key_value_dict in sorted(fund_dict.items()):
            # filter name,industry,,area,
            if key in filter_set:
                continue
            #
            for stock_id, value in stock_key_value_dict.items():
                if is_filter_new_stock:
                    if key == "timeToMarket":
                        timeToMarket = str(value)
                        try:
                            date_temp = time.strptime(
                                timeToMarket, '%Y%m%d')
                        except ValueError:
                            # unparsable listing date: keep the raw value
                            # and skip the new-stock check for this row
                            logger1.error(
                                "{} has invalid timeToMarket value!".
                                format(stock_id))
                            temp_stock_feature_dict[stock_id].append(
                                (key, value))
                            continue
                        date_obj = datetime.datetime(*date_temp[:3]).date()
                        # set the threshold for new stock
                        delta = datetime.timedelta(days=28)
                        #
                        date_gap = single_date - date_obj
                        if date_gap <= delta:
                            # listed within 28 days: mark for removal
                            print(
                                "stock_id: {} is new stock for {}, release date: {}"
                                .format(stock_id, single_date,
                                        timeToMarket))
                            temp_stock_feature_dict_key_pop_set.add(
                                stock_id)
                temp_stock_feature_dict[stock_id].append((key, value))
        # filter new stocks
        if is_filter_new_stock:
            for stock_id in temp_stock_feature_dict_key_pop_set:
                temp_stock_feature_dict.pop(stock_id, 'None')
        # build one feature array per stock, attributes sorted by name so
        # the order matches self.f_attributors
        for stock_id, feature_list in temp_stock_feature_dict.items():
            feature_list = sorted(feature_list, key=lambda x: x[0])
            feature_value_list = [x[1] for x in feature_list]
            feature_array = np.array(feature_value_list)
            sample_name = date_str + '_' + stock_id
            # save samples
            self.a_share_samples_f_dict[sample_name] = feature_array
        print("saving {}'s stock feature to a_share_samples_f_dict".format(
            single_date))
    print("f_attributors: {}".format(f_attributors))
    print("a_share_samples_f_dict_value: {}".format(
        list(self.a_share_samples_f_dict.values())[0]))