Example #1
0
def run():
    """Run the Naive Bayes pipeline on raw and sanitized data.

    Reads the train and test sets named by the parsed arguments, builds two
    ``Naive_Bayes`` solutions on the raw data (last argument ``False`` and
    ``True``; the output names suggest original vs. filtered vocabulary --
    TODO confirm), then hands each one to ``output_trace`` and ``evaluate``.
    Finally the same is done once more on ``sanitize``-d copies of both sets.
    """
    options = get_args()

    train_set = read_data(data_folder / options.train)
    test_set = read_data(
        data_folder / options.test,
        col_names=("tweet_id", "text", "q1_label"),
    )

    # Build both solutions before anything is written, as the original did.
    regular_solution = Naive_Bayes(train_set, test_set, ['yes', 'no'], False)
    filtered_solution = Naive_Bayes(train_set, test_set, ['yes', 'no'], True)

    # (trace file, evaluation file, solution) for each raw-data run.
    reports = (
        ("trace_NB-BOW-OV.txt", "eval_NB-BOW-OV.txt", regular_solution),
        ("trace_NB-BOW-FV.txt", "eval_NB-BOW-FV.txt", filtered_solution),
    )
    for trace_name, _, solution in reports:
        output_trace(output_folder / trace_name, solution)
    for _, eval_name, solution in reports:
        evaluate(output_folder / eval_name, solution, "yes", "no")

    # Same procedure on sanitized input (last argument False only).
    clean_train = sanitize(train_set)
    clean_test = sanitize(test_set)
    sanitized_solution = Naive_Bayes(
        clean_train, clean_test, ['yes', 'no'], False)

    output_trace(
        output_folder / "trace_NB-BOW-OV_sanitized.txt", sanitized_solution)
    evaluate(
        output_folder / "eval_NB-BOW-OV_sanitized.txt",
        sanitized_solution,
        "yes",
        "no",
    )
 def test_string_sanitizer_removes_spaces(self):
     """sanitize() must drop the space from "0 0201485672"."""
     raw_value = "0 0201485672"
     self.assertEqual("00201485672", sanitize(raw_value))
Example #3
0
 def __init__(
     self,
     text,
     sanitize=False,
     permitted_tags=[
         'a', 'b', 'blockquote', 'br/', 'i', 'li', 'ol', 'ul', 'p',
         'cite', 'code', 'pre', 'img/',
     ],
     allowed_attributes={
         'a': ['href', 'title'],
         'img': ['src', 'alt'],
         'blockquote': ['type'],
     },
 ):
     """Store ``text`` on the instance, optionally sanitizing it first.

     :param text: the value to wrap; ``unicode`` is encoded to UTF-8 with
         XML character references, any other non-``str`` goes through
         ``str()``
     :param sanitize: when true, pass ``text`` through
         ``sanitizer.sanitize`` before storing it (default False)
     :param permitted_tags: forwarded to ``sanitizer.sanitize``
     :param allowed_attributes: forwarded to ``sanitizer.sanitize``
     """
     # The list/dict defaults are shared between calls; this method only
     # forwards them and never modifies them itself.
     if sanitize:
         text = sanitizer.sanitize(text, permitted_tags, allowed_attributes)

     if isinstance(text, unicode):
         self.text = text.encode('utf8', 'xmlcharrefreplace')
     elif isinstance(text, str):
         self.text = text
     else:
         self.text = str(text)
 def test_string_sanitizer_removes_dashes(self):
     """sanitize() must drop the dash from "978-0596809485"."""
     raw_value = "978-0596809485"
     self.assertEqual("9780596809485", sanitize(raw_value))
Example #5
0
 def __init__(
     self,
     text,
     sanitize=False,
     permitted_tags=[
         'a',
         'b',
         'blockquote',
         'br/',
         'i',
         'li',
         'ol',
         'ul',
         'p',
         'cite',
         'code',
         'pre',
         'img/',
     ],
     allowed_attributes={
         'a': ['href', 'title'],
         'img': ['src', 'alt'],
         'blockquote': ['type'],
     },
 ):
     """Keep ``text`` as ``self.text`` after optional sanitizing.

     :param text: value to store; ``unicode`` input is UTF-8 encoded with
         ``xmlcharrefreplace``, other non-``str`` input is converted with
         ``str()``
     :param sanitize: run ``sanitizer.sanitize`` on ``text`` first
         (default False)
     :param permitted_tags: passed through to ``sanitizer.sanitize``
     :param allowed_attributes: passed through to ``sanitizer.sanitize``
     """
     value = text
     if sanitize:
         value = sanitizer.sanitize(value, permitted_tags, allowed_attributes)

     # unicode is checked first, exactly as before; str() is the fallback.
     if isinstance(value, unicode):
         value = value.encode('utf8', 'xmlcharrefreplace')
     elif not isinstance(value, str):
         value = str(value)
     self.text = value
Example #6
0
def load_dataframe(filename):
    """Load the pickled DataFrame stored under ``const.DATAPATH``.

    ``filename`` is given without the ``.pkl`` extension. If the pickle does
    not exist, two errors are logged and the result of
    ``sanitizer.sanitize()`` is returned instead.
    """
    pickle_path = const.DATAPATH + filename + ".pkl"

    if not os.path.isfile(pickle_path):
        logger.error("NO SANITIZED PICKLE: " + const.DATAPATH + filename)
        logger.error("SANITIZE DATA FIRST")
        return sanitizer.sanitize()

    return pd.read_pickle(pickle_path)
Example #7
0
def get_coach_data(path):
	"""Build an ``Athlete`` from the first line of the file at ``path``.

	The line is split on commas. The first two fields are passed to
	``Athlete`` as-is; the remaining fields are sanitized, de-duplicated,
	sorted, and only the first three are kept. If the file cannot be read,
	the error is printed and ``None`` is returned.
	"""
	try:
		with open(path) as reader:
			first_line = reader.readline()
		fields = first_line.strip().split(',')
		first_field = fields.pop(0)
		second_field = fields.pop(0)
		unique_values = set([sanitize(t) for t in fields])
		return Athlete(first_field, second_field, sorted(unique_values)[0:3])
	except IOError as ioerror:
		print('File Error: ' + str(ioerror))
		return None
def main():
    """Run the pipeline: database login, download, export, sanitization.

    Connection details are read from the ``TRITON_ANALYTICS_*`` environment
    variables; each stage is announced with an INFO log line.
    """
    logging.basicConfig(
        level=logging.INFO,
        format='[%(asctime)s] [%(levelname)s] %(message)s',
    )

    divider = '-' * 30

    # Stage 1: open the database.
    logging.info(divider + " DATABASE LOGIN " + divider)
    mongo_uri = os.environ.get("TRITON_ANALYTICS_MONGODB")
    db = MongoClient(mongo_uri).get_database("tritonanalytics")

    # Stage 2: download recent analytics.
    logging.info(divider + " DOWNLOAD PROCESS " + divider)
    fb_id = os.environ.get("TRITON_ANALYTICS_FBID")
    fb_token = os.environ.get("TRITON_ANALYTICS_FBTOKEN")
    download(fb_id, fb_token)

    # Stage 3: export everything that is stored.
    logging.info(divider + " EXPORT PROCESS " + divider)
    export_to(db)

    # Stage 4: sanitize the database.
    logging.info(divider + " SANITIZATION " + divider)
    sanitize(db)
Example #9
0
    def __init__(
        self,
        text,
        sanitize=False,
        permitted_tags=[
            'a', 'b', 'blockquote', 'br/', 'i', 'li', 'ol', 'ul', 'p',
            'cite', 'code', 'pre', 'img/',
        ],
        allowed_attributes={
            'a': ['href', 'title'],
            'img': ['src', 'alt'],
            'blockquote': ['type'],
        },
    ):
        """
        Wrap ``text`` and keep it as ``self.text``.

        :param text: the XML text; ``unicode`` is encoded to UTF-8 (with XML
            character references), anything else that is not ``str`` is
            converted with ``str()``
        :param sanitize: if true, filter the text through
            ``sanitizer.sanitize`` using the permitted tags and allowed
            attributes (default False)
        :param permitted_tags: list of permitted tags
        :param allowed_attributes: dictionary mapping a tag to the list of
            attributes allowed on it (defaults cover ``a``, ``img`` and
            ``blockquote``)
        """
        if sanitize:
            text = sanitizer.sanitize(text, permitted_tags, allowed_attributes)

        if isinstance(text, unicode):
            self.text = text.encode('utf8', 'xmlcharrefreplace')
            return

        # Not unicode: keep str as-is, stringify anything else.
        self.text = text if isinstance(text, str) else str(text)
Example #10
0
    def __init__(
        self,
        text,
        sanitize=False,
        permitted_tags=[
            'a',
            'b',
            'blockquote',
            'br/',
            'i',
            'li',
            'ol',
            'ul',
            'p',
            'cite',
            'code',
            'pre',
            'img/',
        ],
        allowed_attributes={
            'a': ['href', 'title'],
            'img': ['src', 'alt'],
            'blockquote': ['type'],
        },
    ):
        """
        Store the given XML text in ``self.text``.

        :param text: the XML text
        :param sanitize: when true, the text is first cleaned by
            ``sanitizer.sanitize`` with the permitted tags and allowed
            attributes (default False)
        :param permitted_tags: list of permitted tags (default: a short list
            of simple tags)
        :param allowed_attributes: dictionary whose keys are tags and whose
            values are the lists of attributes allowed on that tag (default
            entries for ``a``, ``img`` and ``blockquote``)
        """
        cleaned = text
        if sanitize:
            cleaned = sanitizer.sanitize(
                cleaned, permitted_tags, allowed_attributes)

        # Normalise the stored value: unicode -> UTF-8 bytes, non-str -> str.
        if isinstance(cleaned, unicode):
            cleaned = cleaned.encode('utf8', 'xmlcharrefreplace')
        elif not isinstance(cleaned, str):
            cleaned = str(cleaned)

        self.text = cleaned
Example #11
0
	def top3(self):
		"""Return the first three distinct sanitized values of ``self.times``, in sorted order."""
		distinct = {sanitize(t) for t in self.times}
		return sorted(distinct)[:3]
Example #12
0
# Read the first line of each athlete's file and split it on commas.
with open('./data/james.txt') as jaf:
	data = jaf.readline()
	james = data.strip().split(',')
with open('./data/julie.txt') as juf:
	data = juf.readline()
	julie = data.strip().split(',')
with open('./data/mikey.txt') as mif:
	data = mif.readline()
	mikey = data.strip().split(',')
with open('./data/sarah.txt') as saf:
	data = saf.readline()
	sarah = data.strip().split(',')

# Apply sanitize() to every entry (equivalent to a list comprehension).
clean_james = list(map(sanitize, james))
clean_julie = list(map(sanitize, julie))
clean_mikey = list(map(sanitize, mikey))
clean_sarah = list(map(sanitize, sarah))

# Ascending order would be sorted(clean_james) etc.; descending is printed.
print(sorted(clean_james, reverse=True))
print(sorted(clean_julie, reverse=True))
print(sorted(clean_mikey, reverse=True))
print(sorted(clean_sarah, reverse=True))

# Show the built-in documentation for sorted().
help(sorted)
Example #13
0
# Read the first line of each athlete's file and split it on commas.
with open('./data/james.txt') as jaf:
	data = jaf.readline()
james = data.strip().split(',')
with open('./data/julie.txt') as juf:
	data = juf.readline()
julie = data.strip().split(',')
with open('./data/mikey.txt') as mif:
	data = mif.readline()
mikey = data.strip().split(',')
with open('./data/sarah.txt') as saf:
	data = saf.readline()
sarah = data.strip().split(',')

# Build each cleaned list from scratch. The earlier append loops relied on
# clean_* lists that this script never created, which raises NameError when
# run on its own (or silently grows lists left over from earlier code).
clean_james = [sanitizer.sanitize(each_t) for each_t in james]
clean_julie = [sanitizer.sanitize(each_t) for each_t in julie]
clean_mikey = [sanitizer.sanitize(each_t) for each_t in mikey]
clean_sarah = [sanitizer.sanitize(each_t) for each_t in sarah]

# Print every athlete's cleaned values in descending order
# (use sorted(clean_james) etc. for ascending order).
print (sorted(clean_james,reverse=True))
print (sorted(clean_julie,reverse=True))
print (sorted(clean_mikey,reverse=True))
print (sorted(clean_sarah,reverse=True))
Example #14
0
import os, sanitizer
def getList():
    """Return the entries of the ``data`` directory as HTML list items.

    Every name returned by ``os.listdir('data')`` is passed through
    ``Sanitizer.sanitize`` and rendered as
    ``<li><a href="index.py?id=NAME">NAME</a></li>``. The items are
    concatenated into one string; an empty directory yields ``''``.

    Fixes over the previous version: consistent indentation (the body mixed
    3- and 4-space indents, which is an IndentationError), a well-formed
    ``</li>`` end tag, the missing ``=`` in the ``id`` query parameter, and
    ``str.join`` instead of repeated string concatenation.
    """
    # NOTE(review): this needs ``html_sanitizer`` in module scope, but the
    # file's visible import statement names ``sanitizer`` -- confirm the
    # intended module and import.
    sanitizer = html_sanitizer.Sanitizer()
    items = []
    for item in os.listdir('data'):
        item = sanitizer.sanitize(item)
        items.append(
            '<li><a href="index.py?id={name}">{name}</a></li>'.format(name=item)
        )
    return ''.join(items)
Example #15
0
	def top3(self):
		"""Return the first three distinct sanitized items of ``self``, in sorted order."""
		distinct = {sanitize(t) for t in self}
		return sorted(distinct)[:3]
Example #16
0

# Read the first line of each athlete's file and split it on commas.
with open('./data/james.txt') as jaf:
	data = jaf.readline()
	james = data.strip().split(',')
with open('./data/julie.txt') as juf:
	data = juf.readline()
	julie = data.strip().split(',')
with open('./data/mikey.txt') as mif:
	data = mif.readline()
	mikey = data.strip().split(',')
with open('./data/sarah.txt') as saf:
	data = saf.readline()
	sarah = data.strip().split(',')

# Sanitize every entry and keep each list sorted.
james = sorted(map(sanitize, james))
julie = sorted(map(sanitize, julie))
mikey = sorted(map(sanitize, mikey))
sarah = sorted(map(sanitize, sarah))

# Converting to a set removes duplicates automatically: a value that is
# already present is ignored and the next one is inserted.
# A set has no defined order, so the values are sorted again before slicing.
set_james = set(james)
set_julie = set(julie)
set_mikey = set(mikey)
set_sarah = set(sarah)

print(sorted(set_james)[0:3])
print(sorted(set_julie)[0:3])
print(sorted(set_mikey)[0:3])
Example #17
0
    args = parser.parse_args()
    model = args.model
    src_folder = args.src_folder
    dst_folder = args.dst_folder
    num_words = args.num_words
    num_tags = args.num_tags
    print(model)
    print(num_tags)

    # Reading Train, Dev, Test Files
    train_df = read_train_data(src_folder, 'train.xml')
    dev_df = read_train_data(src_folder, 'dev.xml')
    test_df = read_train_data(src_folder, 'test.xml')

    # Sanitize Training Data
    train_df = sanitize(train_df)
    # train_df.dropna(subset=['AnswerBody'], inplace=True)
    train_df = train_df[train_df['AnswerBody'].apply(len) != 0]

    dev_df = sanitize(dev_df)
    # dev_df.dropna(subset=['AnswerBody'], inplace=True)
    dev_df = dev_df[dev_df['AnswerBody'].apply(len) != 0]

    test_df = sanitize(test_df)
    # test_df.dropna(subset=['AnswerBody'], inplace=True)
    test_df = test_df[test_df['AnswerBody'].apply(len) != 0]

    df = pd.concat([train_df, dev_df, test_df])

    word_freq = get_vocabs(df)
    sorted_words = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)
Example #18
0
	def clean_data(self):
	    """Return the distinct sanitized items of ``self`` as a sorted list."""
	    distinct = {sanitize(t) for t in self}
	    return sorted(distinct)
Example #19
0
#         max_clean_sheets = player["clean_sheets"]
#         max_clean_sheets_player = player

# TODO: sanitize fantasy data
# Until then, only "é" is replaced by "e" in each player's second_name
# (updated in place on the fantasy_data entries).
for player in fantasy_data["elements"]:
    player["second_name"] = player["second_name"].replace("é", "e")

# For every directory name reported by walk(sources_path), queue the path of
# its "0/index.html" file in `sources`.
for (dirpath, dirnames, filenames) in walk(sources_path):
    for event_path_id in dirnames:
        event_file = "{}/{}/0/index.html".format(dirpath, event_path_id)
        sources.append(event_file)

for source in sources:
    raw = event_loader.load(source)
    if len(raw) > 0:
        events = sanitizer.sanitize(raw)
        datetime = events["datetime"]
        line_ups = events["line_ups"]

        for player in line_ups["team_1"]["line_up"]:
            # TODO: patch properly
            if "Fn" not in player:
                player["Fn"] = player["Ln"]

            fantasy_player = [item for item in fantasy_data["elements"] if player["Fn"] in item["first_name"] and player["Ln"] in item["second_name"]]

            if len(fantasy_player) == 1:
                now_cost = fantasy_player[0]["now_cost"]  # TODO: evaluate initial season cost
                player["cost"] = now_cost

            if len(fantasy_player) > 1:
Example #20
0
# Read the first line of each athlete's file and split it on commas.
with open('./data/james.txt') as jaf:
	data = jaf.readline()
	james = data.strip().split(',')
with open('./data/julie.txt') as juf:
	data = juf.readline()
	julie = data.strip().split(',')
with open('./data/mikey.txt') as mif:
	data = mif.readline()
	mikey = data.strip().split(',')
with open('./data/sarah.txt') as saf:
	data = saf.readline()
	sarah = data.strip().split(',')


# Run sanitize() on every entry, then sort. Per the original note, sanitize()
# unifies the differing hour/minute separators to ":".
james = sorted(map(sanitize, james))
julie = sorted(map(sanitize, julie))
mikey = sorted(map(sanitize, mikey))
sarah = sorted(map(sanitize, sarah))

# Converting to a set removes duplicates automatically: a value that is
# already present is ignored and the next one is inserted.
# A set has no defined order, so the values are sorted again before slicing.
set_james = set(james)
set_julie = set(julie)
set_mikey = set(mikey)
set_sarah = set(sarah)

# Show only the top 3 records.
print(sorted(set_james)[0:3])
print(sorted(set_julie)[0:3])