def run():
    args = get_args()
    train_set = read_data(data_folder / args.train)
    test_set = read_data(data_folder / args.test, col_names=("tweet_id", "text", "q1_label"))

    regular_solution = Naive_Bayes(train_set, test_set, ['yes', 'no'], False)
    filtered_solution = Naive_Bayes(train_set, test_set, ['yes', 'no'], True)

    output_trace(output_folder / "trace_NB-BOW-OV.txt", regular_solution)
    output_trace(output_folder / "trace_NB-BOW-FV.txt", filtered_solution)

    evaluate(output_folder / "eval_NB-BOW-OV.txt", regular_solution, "yes", "no")
    evaluate(output_folder / "eval_NB-BOW-FV.txt", filtered_solution, "yes", "no")

    # Using sanitized input
    train_set_sanitized = sanitize(train_set)
    test_set_sanitized = sanitize(test_set)
    sanitized_solution = Naive_Bayes(train_set_sanitized, test_set_sanitized, ['yes', 'no'], False)
    output_trace(output_folder / "trace_NB-BOW-OV_sanitized.txt", sanitized_solution)
    evaluate(output_folder / "eval_NB-BOW-OV_sanitized.txt", sanitized_solution, "yes", "no")
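# The sanitize() called by run() above is not shown in this snippet. A minimal
# sketch of a dataset-level sanitizer, ASSUMING each row is a
# (tweet_id, text, label) tuple as the read_data call suggests -- the
# lowercasing/stripping rules are illustrative assumptions, not the project's
# actual implementation, so the helper is named sanitize_rows to avoid
# claiming otherwise:
import re

def sanitize_rows(rows):
    """Lowercase tweet text and keep only word characters and whitespace."""
    cleaned = []
    for tweet_id, text, label in rows:
        text = re.sub(r"[^\w\s]", "", text.lower())
        cleaned.append((tweet_id, text, label))
    return cleaned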
def test_string_sanitizer_removes_spaces(self):
    # Arrange
    expected = "00201485672"
    # Act
    result = sanitize("0 0201485672")
    # Assert
    self.assertEqual(expected, result)
def __init__(
    self,
    text,
    sanitize=False,
    permitted_tags=[
        'a', 'b', 'blockquote', 'br/', 'i', 'li',
        'ol', 'ul', 'p', 'cite', 'code', 'pre', 'img/',
    ],
    allowed_attributes={
        'a': ['href', 'title'],
        'img': ['src', 'alt'],
        'blockquote': ['type'],
    },
):
    if sanitize:
        text = sanitizer.sanitize(text, permitted_tags, allowed_attributes)
    if isinstance(text, unicode):  # Python 2: encode unicode to a UTF-8 byte string
        text = text.encode('utf8', 'xmlcharrefreplace')
    elif not isinstance(text, str):
        text = str(text)
    self.text = text
def test_string_sanitizer_removes_dashes(self):
    # Arrange
    expected = "9780596809485"
    # Act
    result = sanitize("978-0596809485")
    # Assert
    self.assertEqual(expected, result)
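# Together, the two string-sanitizer tests above pin down the expected
# behavior: sanitize() must drop spaces and dashes from ISBN-like strings.
# A minimal implementation that satisfies both assertions (the body is an
# assumption; only the behavior comes from the tests):
def sanitize(value):
    """Remove spaces and dashes, e.g. '978-0596809485' -> '9780596809485'."""
    return value.replace(" ", "").replace("-", "")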
def load_dataframe(filename):
    """Load a pickled dataframe by base filename (the sanitized, ready_for_R data)."""
    if os.path.isfile(const.DATAPATH + filename + ".pkl"):
        return pd.read_pickle(const.DATAPATH + filename + ".pkl")
    else:
        # No sanitized pickle on disk; build the dataframe from scratch
        logger.error("NO SANITIZED PICKLE: " + const.DATAPATH + filename)
        logger.error("SANITIZE DATA FIRST")
        df = sanitizer.sanitize()
        return df
def get_coach_data(path):
    try:
        with open(path) as reader:
            data = reader.readline()
        data_split = data.strip().split(',')
        athlete_data = Athlete(
            data_split.pop(0),                                   # name
            data_split.pop(0),                                   # date of birth
            sorted(set([sanitize(t) for t in data_split]))[0:3]  # top 3 unique times
        )
        return athlete_data
    except IOError as ioerror:
        print('File Error: ' + str(ioerror))
        return None
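# Several snippets in this file (get_coach_data, top3, and the
# james/julie/mikey/sarah scripts below) assume a Head First Python-style
# sanitize() that normalizes coach time strings before sorting. A sketch
# consistent with that usage -- treat it as an assumed helper, not verified
# source for any one of these projects:
def sanitize(time_string):
    """Normalize '2-34' and '2:34' to '2.34' so times compare consistently."""
    if '-' in time_string:
        splitter = '-'
    elif ':' in time_string:
        splitter = ':'
    else:
        return time_string
    (mins, secs) = time_string.split(splitter)
    return mins + '.' + secs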
def main():
    # Setup logging
    logging.basicConfig(format='[%(asctime)s] [%(levelname)s] %(message)s',
                        level=logging.INFO)
    divider = '-' * 30

    # Login to database
    logging.info(divider + " DATABASE LOGIN " + divider)
    db = MongoClient(os.environ.get("TRITON_ANALYTICS_MONGODB")).get_database(
        "tritonanalytics")

    # Download recent analytics
    logging.info(divider + " DOWNLOAD PROCESS " + divider)
    download(os.environ.get("TRITON_ANALYTICS_FBID"),
             os.environ.get("TRITON_ANALYTICS_FBTOKEN"))

    # Export all stored analytics
    logging.info(divider + " EXPORT PROCESS " + divider)
    export_to(db)

    # Sanitize database
    logging.info(divider + " SANITIZATION " + divider)
    sanitize(db)
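# What sanitize(db) does in main() above is not shown. A hypothetical sketch
# of a database sanitization pass -- the "analytics" collection name and the
# "drop documents missing a timestamp" rule are both invented for
# illustration, not taken from the project:
def sanitize(db):
    """Remove analytics documents that are missing required fields."""
    result = db.get_collection("analytics").delete_many(
        {"timestamp": {"$exists": False}})
    logging.info("Removed %d malformed documents", result.deleted_count)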
def __init__(
    self,
    text,
    sanitize=False,
    permitted_tags=[
        'a', 'b', 'blockquote', 'br/', 'i', 'li',
        'ol', 'ul', 'p', 'cite', 'code', 'pre', 'img/',
    ],
    allowed_attributes={
        'a': ['href', 'title'],
        'img': ['src', 'alt'],
        'blockquote': ['type'],
    },
):
    """
    :param text: the XML text
    :param sanitize: sanitize text using the permitted tags and allowed
        attributes (default False)
    :param permitted_tags: list of permitted tags (default: simple list of tags)
    :param allowed_attributes: dictionary of allowed attributes (default for
        A, IMG and BLOCKQUOTE). The key is the tag; the value is a list of
        allowed attributes.
    """
    if sanitize:
        text = sanitizer.sanitize(text, permitted_tags, allowed_attributes)
    if isinstance(text, unicode):
        text = text.encode('utf8', 'xmlcharrefreplace')
    elif not isinstance(text, str):
        text = str(text)
    self.text = text
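# Hypothetical usage of the web2py-style XML wrapper above. The class name XML
# is an assumption (only the __init__ signature appears in the snippet); the
# point is that attributes outside allowed_attributes, such as onclick, should
# be stripped when sanitize=True:
link = XML('<a href="http://example.com" onclick="steal()">site</a>',
           sanitize=True)
print(link.text)  # expected: onclick removed, href kept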
def top3(self):
    return sorted(set([sanitize(t) for t in self.times]))[0:3]
with open('./data/james.txt') as jaf:
    data = jaf.readline()
james = data.strip().split(',')

with open('./data/julie.txt') as juf:
    data = juf.readline()
julie = data.strip().split(',')

with open('./data/mikey.txt') as mif:
    data = mif.readline()
mikey = data.strip().split(',')

with open('./data/sarah.txt') as saf:
    data = saf.readline()
sarah = data.strip().split(',')

# List comprehensions
clean_james = [sanitize(each_t) for each_t in james]
clean_julie = [sanitize(each_t) for each_t in julie]
clean_mikey = [sanitize(each_t) for each_t in mikey]
clean_sarah = [sanitize(each_t) for each_t in sarah]

# print(sorted(clean_james))
# print(sorted(clean_julie))
# print(sorted(clean_mikey))
# print(sorted(clean_sarah))

print(sorted(clean_james, reverse=True))
print(sorted(clean_julie, reverse=True))
print(sorted(clean_mikey, reverse=True))
print(sorted(clean_sarah, reverse=True))

help(sorted)
with open('./data/james.txt') as jaf:
    data = jaf.readline()
james = data.strip().split(',')

with open('./data/julie.txt') as juf:
    data = juf.readline()
julie = data.strip().split(',')

with open('./data/mikey.txt') as mif:
    data = mif.readline()
mikey = data.strip().split(',')

with open('./data/sarah.txt') as saf:
    data = saf.readline()
sarah = data.strip().split(',')

# Start with empty lists for the cleaned times
clean_james, clean_julie, clean_mikey, clean_sarah = [], [], [], []

for each_t in james:
    clean_james.append(sanitizer.sanitize(each_t))
for each_t in julie:
    clean_julie.append(sanitizer.sanitize(each_t))
for each_t in mikey:
    clean_mikey.append(sanitizer.sanitize(each_t))
for each_t in sarah:
    clean_sarah.append(sanitizer.sanitize(each_t))

# print(sorted(clean_james))
# print(sorted(clean_julie))
# print(sorted(clean_mikey))
# print(sorted(clean_sarah))

print(sorted(clean_james, reverse=True))
print(sorted(clean_julie, reverse=True))
print(sorted(clean_mikey, reverse=True))
print(sorted(clean_sarah, reverse=True))
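# The four near-identical blocks above invite a data-driven rewrite. A sketch
# of the same read/sanitize/print logic as a loop over names -- a design
# alternative, not the original code:
athletes = {}
for name in ('james', 'julie', 'mikey', 'sarah'):
    with open('./data/{}.txt'.format(name)) as f:
        times = f.readline().strip().split(',')
    athletes[name] = [sanitizer.sanitize(t) for t in times]

for name, times in athletes.items():
    print(name, sorted(times, reverse=True))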
import os
import html_sanitizer

def getList():
    sanitizer = html_sanitizer.Sanitizer()
    files = os.listdir('data')
    listStr = ''
    for item in files:
        item = sanitizer.sanitize(item)
        listStr = listStr + '<li><a href="index.py?id={name}">{name}</a></li>'.format(name=item)
    return listStr
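# html_sanitizer is built for cleaning HTML fragments; for plain filenames
# being dropped into markup, the stdlib escape is a simpler alternative. A
# sketch of that approach (list_item is a hypothetical helper, not part of the
# snippet above):
import html

def list_item(filename):
    """Escape <, >, & and quotes so the filename cannot inject markup."""
    safe = html.escape(filename, quote=True)
    return '<li><a href="index.py?id={name}">{name}</a></li>'.format(name=safe)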
def top3(self):
    return sorted(set([sanitize(t) for t in self]))[0:3]
with open('./data/james.txt') as jaf:
    data = jaf.readline()
james = data.strip().split(',')

with open('./data/julie.txt') as juf:
    data = juf.readline()
julie = data.strip().split(',')

with open('./data/mikey.txt') as mif:
    data = mif.readline()
mikey = data.strip().split(',')

with open('./data/sarah.txt') as saf:
    data = saf.readline()
sarah = data.strip().split(',')

james = sorted([sanitize(t) for t in james])
julie = sorted([sanitize(t) for t in julie])
mikey = sorted([sanitize(t) for t in mikey])
sarah = sorted([sanitize(t) for t in sarah])

# Converting to a set automatically removes duplicates:
# when a duplicate value is found it is ignored and the next one is added.
# Since sets are unordered, sort again with sorted() before slicing.
set_james = set(james)
set_julie = set(julie)
set_mikey = set(mikey)
set_sarah = set(sarah)

print(sorted(set_james)[:3])
print(sorted(set_julie)[:3])
print(sorted(set_mikey)[:3])
print(sorted(set_sarah)[:3])
args = parser.parse_args()
model = args.model
src_folder = args.src_folder
dst_folder = args.dst_folder
num_words = args.num_words
num_tags = args.num_tags
print(model)
print(num_tags)

# Read the train, dev, and test files
train_df = read_train_data(src_folder, 'train.xml')
dev_df = read_train_data(src_folder, 'dev.xml')
test_df = read_train_data(src_folder, 'test.xml')

# Sanitize each split and drop rows whose answer body came back empty
train_df = sanitize(train_df)
# train_df.dropna(subset=['AnswerBody'], inplace=True)
train_df = train_df[train_df['AnswerBody'].apply(len) != 0]

dev_df = sanitize(dev_df)
# dev_df.dropna(subset=['AnswerBody'], inplace=True)
dev_df = dev_df[dev_df['AnswerBody'].apply(len) != 0]

test_df = sanitize(test_df)
# test_df.dropna(subset=['AnswerBody'], inplace=True)
test_df = test_df[test_df['AnswerBody'].apply(len) != 0]

df = pd.concat([train_df, dev_df, test_df])
word_freq = get_vocabs(df)
sorted_words = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)
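# sanitize(df) is not defined in the snippet above. A hypothetical
# dataframe-level sanitizer consistent with the empty-body filtering that
# follows it -- the tag-stripping rule is an assumption; only the 'AnswerBody'
# column name comes from the snippet:
import re
import pandas as pd

def sanitize(df):
    """Strip HTML tags and surrounding whitespace from the AnswerBody column."""
    df = df.copy()
    df['AnswerBody'] = (df['AnswerBody']
                        .fillna('')
                        .map(lambda s: re.sub(r'<[^>]+>', ' ', s).strip()))
    return df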
def clean_data(self):
    return sorted(set([sanitize(t) for t in self]))
# max_clean_sheets = player["clean_sheets"]
# max_clean_sheets_player = player

# TODO: sanitize fantasy data
for player in fantasy_data["elements"]:
    player["second_name"] = player["second_name"].replace("é", "e")

for (dirpath, dirnames, filenames) in walk(sources_path):
    for event_path_id in dirnames:
        event_file = "{}/{}/0/index.html".format(dirpath, event_path_id)
        sources.append(event_file)

for source in sources:
    raw = event_loader.load(source)
    if len(raw) > 0:
        events = sanitizer.sanitize(raw)
        datetime = events["datetime"]
        line_ups = events["line_ups"]
        for player in line_ups["team_1"]["line_up"]:
            # TODO: patch properly
            if "Fn" not in player:
                player["Fn"] = player["Ln"]
            fantasy_player = [item for item in fantasy_data["elements"]
                              if player["Fn"] in item["first_name"]
                              and player["Ln"] in item["second_name"]]
            if len(fantasy_player) == 1:
                now_cost = fantasy_player[0]["now_cost"]
                # TODO: evaluate initial season cost
                player["cost"] = now_cost
            if len(fantasy_player) > 1:
with open('./data/james.txt') as jaf:
    data = jaf.readline()
james = data.strip().split(',')

with open('./data/julie.txt') as juf:
    data = juf.readline()
julie = data.strip().split(',')

with open('./data/mikey.txt') as mif:
    data = mif.readline()
mikey = data.strip().split(',')

with open('./data/sarah.txt') as saf:
    data = saf.readline()
sarah = data.strip().split(',')

# Run sanitize() to unify the differing minute/second separators, then sort
james = sorted([sanitize(t) for t in james])
julie = sorted([sanitize(t) for t in julie])
mikey = sorted([sanitize(t) for t in mikey])
sarah = sorted([sanitize(t) for t in sarah])

# Converting to a set automatically removes duplicates:
# a repeated value is ignored and the next one is added.
# Since sets are unordered, sort again with sorted().
set_james = set(james)
set_julie = set(julie)
set_mikey = set(mikey)
set_sarah = set(sarah)

# Show only the top 3 records
print(sorted(set_james)[:3])
print(sorted(set_julie)[:3])
print(sorted(set_mikey)[:3])
print(sorted(set_sarah)[:3])