def main(search):
    '''
    Executes a search for a car on cars.com using make-model-year info.

    params: search is the reddit message/comment body that should contain
            the make model year information that we would like to look up
    '''
    search = search.lower()

    # clean up info to create query
    output = ''
    sanitizer = Sanitizer()
    query = sanitizer.sanitize_input(search)

    # verify that we received data back, and make a request
    if query:
        print('making request with', query)
        req = requests.get('http://www.cars.com/research/' + query)
    else:
        return output

    # parse html response using Beautiful Soup
    # https://www.crummy.com/software/BeautifulSoup/bs4/doc/
    soup = BeautifulSoup(req.text, 'html.parser')  # get the entire html of the site
    specs = soup.findAll('div', {'class': 'mmy-spec'})  # find all list items in the list
    other_trims = soup.findAll('div', {'class': 'trim_listing'})  # find other trims

    # print info
    if bool(specs) or bool(other_trims):
        output = output + print_information(specs)
        output = output + '\n\n---\n\n ' \
            'in order to not be annoying I am not printing all trims!'
        # output = output + print_trims(other_trims, output)
    return output
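# Hedged usage sketch: `main` takes the raw reddit comment body. What
# Sanitizer.sanitize_input turns that body into (and hence the final cars.com
# URL) is not shown in this snippet, so the comment text below is only an
# assumed example, not output from the original bot.
#
#   reply = main("Thinking about a 2016 honda civic, any opinions?")
#   print(reply)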
def add_sanitizer(self, method: Method, sink: Sink, sink_method_idx: int,
                  sanitizer: dict, level: int = 0):
    """Add a new sanitizer to a sink.

    New sanitizers are only relevant if a method's parameters are used in a
    sanitizer.

    Parameters
    ----------
    method : Method
        The method where the sanitizer was added
    sink : Sink
        The sink to add the sanitizer to
    sink_method_idx : int
        The index of the method for which to add the new sanitizer
    sanitizer : dict
        A dictionary definition of the sanitizer as defined in the ruleset
    level : int, optional
        The depth of the nesting before the sanitizer is reached (sanitizers
        defined in the rules get a level of 0, sanitizers that call those get
        a level of 1, sanitizers that call those get a level of 2 and so on)
    """
    new_sanitizer = Sanitizer(sanitizer, level)
    duplicate = False
    for existing_sanitizer in sink.methods[sink_method_idx]['Sanitizers']:
        if existing_sanitizer.object_name != new_sanitizer.object_name:
            continue
        if new_sanitizer.methods == existing_sanitizer.methods:
            duplicate = True
            break
    if not duplicate:
        sink.methods[sink_method_idx]['Sanitizers'].append(new_sanitizer)
        self.notify_observers(method, changed_sanitizer=True)
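# Hedged usage sketch for add_sanitizer (not from the original module): the
# dict contents are placeholders, only the `level` convention follows the
# docstring above.
#
#   rule_sanitizer = {...}   # sanitizer definition taken straight from the ruleset
#   self.add_sanitizer(method, sink, 0, rule_sanitizer)              # level 0
#
#   # a helper that merely calls that sanitizer is recorded one level deeper
#   wrapper_sanitizer = {...}
#   self.add_sanitizer(method, sink, 0, wrapper_sanitizer, level=1)  # level 1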
def prep_sanits(self):
    # show how many bottles are left
    self.sanits = Group()
    for sanit_number in range(self.stats.sanits_left):
        sanit = Sanitizer(self.s, self.screen)
        sanit.rect.x = 10 + sanit_number * sanit.rect.width
        sanit.rect.y = 10
        self.sanits.add(sanit)
def rungame():
    # initialize pygame, settings, and screen objects
    pygame.init()
    s = Settings()
    screen = pygame.display.set_mode((s.screen_width, s.screen_height))
    pygame.display.set_caption("CORONA INVASION!")

    # make the play button
    play_button = Button(s, screen, "Play")

    # create an instance to store game statistics and create a scoreboard
    stats = GameStats(s)
    sb = Scoreboard(s, screen, stats)

    # time to make a ship
    sanit = Sanitizer(s, screen)

    # make a virus
    # coro = Coro(s, screen)  # optional for now

    # make a group to store bullets in
    bubbles = Group()
    coros = Group()

    # create a fleet of viruses
    f.create_fleet(s, screen, sanit, coros)

    # main loop for the game
    while True:
        f.check_events(s, screen, stats, sb, play_button, sanit, coros, bubbles)
        bubbles.update()
        if stats.game_active:
            sanit.update()
            f.update_bubbles(s, screen, stats, sb, sanit, coros, bubbles)
            f.update_coros(s, screen, stats, sb, sanit, coros, bubbles)
        f.update_screen(s, screen, stats, sb, sanit, coros, bubbles, play_button)
def get_identifier_flow(self, identifier):
    if identifier in self.variable_flows:
        # get existing flow
        flow = self.variable_flows[identifier]
    else:
        # new variable: check if source/sink/sanitizer
        flows = []
        flows.append(Source(identifier, self.is_source(identifier)))
        flows.append(Sink(identifier, self.is_sink(identifier)))
        flows.append(Sanitizer(identifier, self.is_sanitizer(identifier)))
        flow = Flow(flows)
    return flow
def __init__(self, definition: dict):
    """Constructor for class `Sink`. Load definitions.

    Parameters
    ----------
    definition : dict
        Definitions for the object name, methods and sanitizers
    """
    self.object_name = next(iter(definition))
    self.methods = definition[self.object_name]['Methods']
    for idx, method in enumerate(self.methods):
        original_sanitizers = copy.deepcopy(
            self.methods[idx].get('Sanitizers', []))
        method['Sanitizers'] = list()
        for sanitizer in original_sanitizers:
            method['Sanitizers'].append(Sanitizer(sanitizer))

    # Make sure that the definition has a valid format
    assert all(['Methodname' in method for method in self.methods])
    assert all(['Parameters' in method for method in self.methods])
    assert all(['Comment' in method for method in self.methods])
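# Hedged sketch of the expected `definition` layout, inferred from the
# constructor and asserts above: one object name maps to a 'Methods' list
# whose entries carry 'Methodname', 'Parameters', 'Comment' and optionally
# 'Sanitizers'. The concrete values here are made up for illustration and
# assume the `Sink` class above is importable.
example_definition = {
    'Statement': {                            # hypothetical object name
        'Methods': [
            {
                'Methodname': 'execute',          # hypothetical method name
                'Parameters': [0],                # hypothetical parameter spec
                'Comment': 'SQL execution sink',  # free-text comment
                'Sanitizers': [],                 # optional; defaults to []
            },
        ],
    },
}
sink = Sink(example_definition)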
def calculate_grade_level(content):
    words = []
    tokens = content.split()
    garbage_words = ["", ";", ",", "!", "?", "."]
    number_of_syllables = 0
    for token in tokens:
        sanitized_word = Sanitizer.sanitize_word(token)
        if sanitized_word in garbage_words:
            continue
        else:
            number_of_syllables += syllables_en.count_syllables(sanitized_word)
            words.append(sanitized_word)
    token_arrays = re.split(r'\.|!|\?', content)
    sentences = []
    for sentence in token_arrays:
        if sentence not in garbage_words:
            sentences.append(sentence)
    number_of_sentences = len(sentences)
    number_of_words = len(words)
    print "Syllables: " + str(number_of_syllables)
    print "Words: " + str(number_of_words)
    print str(words)
    print "Sentences: " + str(number_of_sentences)
    print str(sentences)
    # Flesch-Kincaid grade level formula
    grade_level = 0.39 * (float(number_of_words) / float(number_of_sentences)) + \
        11.8 * (float(number_of_syllables) / float(number_of_words)) - 15.59
    return grade_level
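# Worked example of the grade-level formula above (the counts are made up,
# not taken from the original code):
# 100 words across 5 sentences with 150 syllables gives
#   0.39 * (100 / 5.0) + 11.8 * (150 / 100.0) - 15.59
# = 0.39 * 20.0        + 11.8 * 1.5           - 15.59
# = 7.8 + 17.7 - 15.59
# = 9.91, i.e. roughly a tenth-grade reading level.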
def test_sanitizer(self):
    sanitizer = Sanitizer()
    file_removed = sanitizer.clean(
        '/Users/cv/.m2/repository/ant/ant/1.6.5/ant-1.6.5.jar')
    print(file_removed)
import nltk
import csv
from sanitizer import Sanitizer
from multiprocessing import Pool

csv.field_size_limit(2**30)
sanitizer = Sanitizer()
filename = 'dataset.csv'


def generate_training_data(row):
    document_id, title, content, date_posted, court = row
    judgement = sanitizer.extract_judgement(content)
    sentences = nltk.sent_tokenize(unicode(judgement, errors='ignore'))
    return map(sanitizer.tokenize, sentences)


if __name__ == "__main__":
    reader = csv.reader(open(filename, 'rb'))
    reader.next()
    pool = Pool()
    results = pool.map(generate_training_data, reader)
    sentences = []
    for sent in results:
        sentences.extend(sent)
    with open('sentences.txt', 'w') as f:
        for sentence in sentences:
            f.write(' '.join(sentence) + '\n')
def makeAbove(arguments):
    global template_path_g, output_path_g, script_dir_g
    mySanitizer = Sanitizer()  # default sanitizer for mAm. Clears JS and CSS, leaves html in.

    # for every argument, check if set and handle accordingly
    with open(template_path_g, 'r') as templateFile:
        template = Template(templateFile.read())

    # set title if there should be one
    if arguments['--title'] is not None:
        title = arguments['--title']
    else:
        title = ""
    # clean title
    title = mySanitizer.cleanHTML(title)

    image = ""
    if arguments['--image'] is not None:
        image = arguments['--image']
        image = mySanitizer.cleanHTML(image)

    # create all tags and store them in one long string
    global TAG_HTML_TEMPLATE_STRING
    alltags = ""
    for tag in arguments['--tag']:
        alltags += TAG_HTML_TEMPLATE_STRING.substitute(
            {'tagtext': mySanitizer.cleanHTML(tag)})

    # for the line with points and comments
    global COMMENTLINE_TEMPLATE_STRING
    if arguments['-C'] is not None:
        argsC = mySanitizer.cleanHTML(arguments['-C'])
        commentline = '<a href="" class="C">{0}</a>'.format(argsC)
    elif (arguments['--comments'] is None) and (arguments['--points'] is None):
        commentline = ""
    else:
        comments = 0 if arguments['--comments'] is None else arguments['--comments']
        points = 0 if arguments['--points'] is None else arguments['--points']
        comments = mySanitizer.cleanHTML(comments)
        points = mySanitizer.cleanHTML(points)
        subC = "{0} comments".format(comments)
        subP = "{0} points".format(points)
        commentline = COMMENTLINE_TEMPLATE_STRING.substitute({
            'points': subP,
            'comments': subC
        })

    # set text if there should be any
    global TXT_TEMPLATE_STRING
    text = ''
    if arguments['--text'] is not None:
        text = arguments['--text']
        text = mySanitizer.cleanHTML(text)
        text = TXT_TEMPLATE_STRING.substitute({'text': text})  # write text into html-string

    substDir = {
        'title': title,
        'image': image,
        'tags': alltags,
        'commentline': commentline,
        'text': text
    }
    tempStr = template.substitute(substDir)

    # write result to temp file
    (fd, filename) = tempfile.mkstemp(
        suffix='.html', dir=script_dir_g)  # create the tempfile in the script-containing directory
    try:
        tfile = os.fdopen(fd, "w")
        tfile.write(tempStr)
        tfile.close()
        if not arguments['-X']:
            webkitres = subprocess.check_output([
                "webkit2png", filename, "-o", output_path_g, "-x", "70", "1000"
            ])
        else:
            webkitres = subprocess.check_output(
                ["webkit2png", filename, "-o", output_path_g])
        print("Called webkit2png with filename {0} and output path {1}".format(
            filename, output_path_g))
    except subprocess.CalledProcessError as e:
        print("webkit2png failed. DO SOMETHING.")  # handle error of webkit2png? I don't know how, so not my job
        exit(2)
    finally:
        os.remove(filename)
"dziwi bo tak siÄ™ zawsze robi to jest taka praktyka" class m_news(object): bodies = ['dsa dsa fdasfsd gdfg dfg dfg fdgerg ghrhy etg ger gre ger', 'dsa dsa fdasfsd gdfg dfg ee fdgerg fd etg ger gre asd', 'dsa rer fdasfsd azz ghrhy etg ger gre ger', 'dsa dsa fdas dfg dfg dfg ger gre ger', 'dsa fda ghrhy gdfg ger gre dsa',] def __init__(self): pass def __call__(self, arg): return News(title='abc', body=m_news.bodies[arg], clean_body=m_news.bodies[arg], url="http://dfsd.fd.com", date=int(time())) from newsgroup import NewsGroup from sanitizer import Sanitizer s=Sanitizer() ng = NewsGroup() m_news = m_news() nr = [ng.quantity_reduce(m_news(i)) for i in range(0, len(m_news.bodies))] print s.cleanup_news(dirty_news) for i in range(0, len(m_news.bodies)): for j in range(i, len(m_news.bodies)): print "("+str(i)+","+str(j)+")"+str(ng.cosine_distance(nr[i], nr[j])) print("jaccard") nr = [ng.binary_reduce(m_news(i)) for i in range(0, len(m_news.bodies))] for i in range(0, len(m_news.bodies)): for j in range(i, len(m_news.bodies)): print "("+str(i)+","+str(j)+")"+str(ng.jaccard_index(nr[i], nr[j]))
class OHHLAScraper:
    OHHLA_URL = "http://ohhla.com/"
    ALL_ARTIST_SITES = ["http://ohhla.com/all.html",
                        "http://ohhla.com/all_two.html",
                        "http://ohhla.com/all_three.html",
                        "http://ohhla.com/all_four.html",
                        "http://ohhla.com/all_five.html"]
    TOP_ARTIST_SITES = ["http://ohhla.com/favorite.html"]
    EXCLUDED_ARTISTS = {'113'}

    def __init__(self, output_directory):
        self.output_directory = output_directory
        self.sanitizer = Sanitizer()

    def scrape_all_artists(self):
        for url in self.ALL_ARTIST_SITES:
            self._scrape_all_artists_page(url)

    def scrape_top_artists(self):
        for url in self.TOP_ARTIST_SITES:
            self._scrape_top_artists_page(url)

    def _scrape_all_artists_page(self, url):
        dom = self._extract_dom(url)
        artist_refs = dom.xpath("//pre/a[@href]/@href")
        for artist_ref in artist_refs:
            ref_split = artist_ref.rsplit('/')
            if not artist_ref or self._is_parent_ref(url, artist_ref) or len(ref_split) < 2:
                continue
            artist_name = ref_split[-2]
            if not artist_name or artist_name in self.EXCLUDED_ARTISTS:
                continue
            artist_url = self.OHHLA_URL + artist_ref
            artist_file_name = '{}/{}.txt'.format(self.output_directory, artist_name)
            with open(artist_file_name, 'w') as output_file:
                self._scrape_artist_page(artist_url, output_file)
                output_file.write('\n')

    def _scrape_top_artists_page(self, url):
        dom = self._extract_dom(url)
        artist_refs = dom.xpath("//td/a[@href]/@href")
        for artist_ref in artist_refs:
            artist_name = artist_ref.replace('YFA_', '').replace('.html', '')
            if not artist_name or artist_name in self.EXCLUDED_ARTISTS:
                continue
            artist_url = self.OHHLA_URL + artist_ref
            artist_file_name = '{}/{}.txt'.format(self.output_directory, artist_name)
            with open(artist_file_name, 'w') as output_file:
                self._scrape_top_artist_page(artist_url, output_file)
                output_file.write('\n')

    def _scrape_artist_page(self, url, output_file):
        try:
            dom = self._extract_dom(url)
        except:
            return
        album_refs = dom.xpath("//tr/td/a[@href]/@href")
        for album_ref in album_refs:
            if not album_ref or self._is_parent_ref(url, album_ref):
                continue
            album_url = url + album_ref
            self._scrape_album_page(album_url, output_file)

    def _scrape_album_page(self, url, output_file):
        try:
            dom = self._extract_dom(url)
        except:
            return
        song_refs = dom.xpath("//tr/td/a[@href]/@href")
        for song_ref in song_refs:
            if not song_ref or self._is_parent_ref(url, song_ref):
                continue
            song_url = url + song_ref
            self._scrape_song_page(song_url, output_file)

    def _scrape_top_artist_page(self, url, output_file, recurse=True):
        try:
            dom = self._extract_dom(url)
        except:
            return
        song_refs = dom.xpath("//tr/td/a[@href]/@href")
        for song_ref in song_refs:
            if not song_ref:
                continue
            elif song_ref.endswith('.txt'):
                song_url = self.OHHLA_URL + song_ref
                self._scrape_song_page(song_url, output_file)
            elif song_ref.endswith('html') and recurse:
                next_url = self.OHHLA_URL + song_ref
                self._scrape_top_artist_page(next_url, output_file, recurse=False)

    def _scrape_song_page(self, url, output_file):
        try:
            opened_url = urllib.request.urlopen(url)
            dom = opened_url.read()
        except:
            return
        if re.match(r'^b[\'\"]<!DOCTYPE.*?>', str(dom)) is not None:
            song_html = html.fromstring(dom)
            try:
                lyrics = song_html.xpath("//pre/text()")[0]
            except:
                return
        else:
            lyrics = dom.decode("utf-8", "ignore")
        cleaned_lyrics = self.sanitizer.clean_lyrics(lyrics)
        output_file.write(cleaned_lyrics)
        output_file.write('\n')

    def _is_parent_ref(self, url, ref):
        start_of_relative_ref = len(self.OHHLA_URL) - 1
        end_of_relative_ref = url.rindex('/', 0, len(url) - 1) + 1
        relative_ref = url[start_of_relative_ref:end_of_relative_ref]
        return relative_ref == ref
@staticmethod
def _extract_dom(url):
    opened_url = urllib.request.urlopen(url)
    return html.fromstring(opened_url.read())
class contentParser:
    def __init__(self, ifile):
        self.content = ifile
        self.sanitizer = Sanitizer()
        name = ifile.split('_')
        self.content_id = name[0]
        self.content_type = name[1]
        self.content_date = name[2]
        self.header = {'User-Agent': 'Mozilla/5.0'}

    def parse(self):
        conn = db.connect(**db_config)
        cursor = conn.cursor()
        with open(self.content, 'r') as fp:
            content = json.load(fp)
        print json.dumps(content, indent=4, sort_keys=True)
        for item in content:
            time.sleep(2)
            attack_type = item['attack_type']
            attack_category = item['attack_category']
            author = item['author']
            country = item['country']
            date = item['date']
            target = item['target']
            target_category = item['target_category']
            my_count = 0
            # route each linked article to the matching site-specific parser
            for url in item['link']:
                res = ''
                if 'video' in url:
                    continue
                elif 'image' in url:
                    continue
                elif 'www.pravdareport.com' in url:
                    res = self.parse_pravda(url)
                    media = 'pravda'
                elif 'www.rt.com' in url:
                    res = self.parse_rt(url)
                    media = 'rt'
                elif 'www.washingtonpost.com' in url:
                    res = self.parse_wp(url)
                    media = 'washingtonpost'
                elif 'www.nytimes.com' in url:
                    res = self.parse_nyt(url)
                    media = 'nytimes'
                elif 'www.japantimes.co.jp' in url:
                    res = self.parse_jpt(url)
                    media = 'japantimes'
                elif 'www.nbcnews.com' in url:
                    res = self.parse_nbc(url)
                    media = 'nbcnews'
                elif 'www.theguardian.com' in url:
                    res = self.parse_guardian(url)
                    media = 'theguardian'
                elif 'www.bbc.com' in url:
                    res = self.parse_bbc(url)
                    media = 'bbc'
                elif 'news.yahoo.com' in url:
                    res = self.parse_yahoo(url)
                    media = 'yahoo'
                elif 'www.foxnews.com' in url:
                    res = self.parse_fox(url)
                    media = 'foxnews'
                elif 'www3.nhk.or.jp' in url:
                    res = self.parse_nhk(url)
                    media = 'nhk'
                elif 'www.chinadaily.com.cn' in url:
                    res = self.parse_cndaily(url)
                    media = 'chinadaily'
                elif 'www.aljazeera.com' in url:
                    res = self.parse_alj(url)
                    media = 'aljazeera'
                elif 'www.moscowtimes.com' in url:
                    res = self.parse_moscowt(url)
                    media = 'moscowtimes'
                elif 'www.shanghaidaily.com' in url:
                    res = self.parse_shanghaid(url)
                    media = 'shanghaidaily'
                my_count += 1
                if res:
                    res = self.sanitizer.sanitize(res)
                    print self.content_id, self.content_type, url
                    print res
                    query = ("INSERT INTO ca_analyze "
                             "(attack_id, target, date, author, attack_type, attack_category, target_category, country, flag, count, content, media, url) "
                             "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) "
                             "ON DUPLICATE KEY UPDATE author=%s, attack_type=%s, attack_category=%s, target_category=%s, country=%s, "
                             "flag=%s, count=%s, content=%s")
                    cursor.execute(query, (self.content_id, target, self.content_date, author,
                                           attack_type, attack_category, target_category,
                                           country, self.content_type, my_count, res, media,
                                           url, author, attack_type, attack_category,
                                           target_category, country, self.content_type,
                                           my_count, res,))
        conn.close()

    def parse_init(self, url):
        cj = CookieJar()
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
        opener.addheaders = [('User-Agent',
                              'Mozilla/5.0 (Windows NT 5.1; rv:10.0.1) Gecko/20100101 Firefox/10.0.1')]
        html = opener.open(url, timeout=100)
        return bs(html.read(), 'html.parser')

    def parse_init_off(self, url):
        req = urllib2.Request(url, headers=self.header)
        html = urllib2.urlopen(req, timeout=100)
        return bs(html.read(), 'html.parser')

    def parse_pravda(self, url):
        res = ''
        try:
            text = self.parse_init(url)
        except Exception:
            # fall back to the plain request if the cookie-based fetch fails;
            # give up and return the empty result on a second failure
            try:
                text = self.parse_init_off(url)
            except Exception:
                return res
        article = text.body.find('div', attrs={'id': 'article'})
        if article:
            article = article.find_all("p")
            for p in article:
                res += p.get_text()
                res += ' '
        return res

    def parse_rt(self, url):
        res = ''
        try:
            text = self.parse_init(url)
        except Exception:
            try:
                text = self.parse_init_off(url)
            except Exception:
                return res
        res = text.body.find('div', attrs={'class': 'article__summary'}).get_text()
        article = text.body.find('div', attrs={'class': 'article__text'})
        if article:
            article = article.find_all("p")
            for p in article:
                res += p.get_text()
                res += ' '
        return res

    def parse_yahoo(self, url):
        res = ''
        try:
            text = self.parse_init(url)
        except Exception:
            try:
                text = self.parse_init_off(url)
            except Exception:
                return res
        article = text.body.find('div', attrs={'class': 'yom-art-content'})
        if article:
            article = article.find_all("p")
            for p in article:
                res += p.get_text()
                res += ' '
        return res

    def parse_nyt(self, url):
        res = ''
        try:
            text = self.parse_init(url)
        except Exception:
            try:
                text = self.parse_init_off(url)
            except Exception:
                return res
        article = text.body.find('div', attrs={'id': 'story-body'})
        if article:
            article = article.find_all("p")
            for p in article:
                res += p.get_text()
                res += ' '
        return res

    def parse_wp(self, url):
        res = ''
        try:
            text = self.parse_init(url)
        except Exception:
            try:
                text = self.parse_init_off(url)
            except Exception:
                return res
        article = text.body.find('article', attrs={'itemprop': 'articleBody'})
        if article:
            article = article.find_all("p")
            for p in article:
                res += p.get_text()
                res += ' '
        return res

    def parse_shanghaid(self, url):
        res = ''
        try:
            httplib.HTTPConnection.debuglevel = 1
            request = urllib2.Request(url)
            request.add_header('Accept-encoding', 'gzip')
            opener = urllib2.build_opener()
            html = opener.open(request)
            data = html.read()
            data = StringIO.StringIO(data)
            gzipper = gzip.GzipFile(fileobj=data)
            text = gzipper.read()
            text = bs(text, 'html.parser')
            article = text.find('div', attrs={'class': 'detail_content'})
            if article:
                article = article.find_all("p")
                for p in article:
                    res += p.get_text()
                    res += ' '
            return res
        except Exception:
            return res

    def parse_jpt(self, url):
        res = ''
        try:
            text = self.parse_init(url)
        except Exception:
            try:
                text = self.parse_init_off(url)
            except Exception:
                return res
        article = text.body.find('div', attrs={'id': 'jtarticle'})
        if article:
            article = article.find_all("p")
            for p in article:
                res += p.get_text()
                res += ' '
        return res

    def parse_nbc(self, url):
        res = ''
        try:
            text = self.parse_init(url)
        except Exception:
            try:
                text = self.parse_init_off(url)
            except Exception:
                return res
        article = text.body.find('div', attrs={'itemprop': 'articleBody'})
        if article:
            article = article.find_all("p")
            for p in article:
                res += p.get_text()
                res += ' '
        return res

    def parse_bbc(self, url):
        res = ''
        try:
            text = self.parse_init(url)
        except Exception:
            try:
                text = self.parse_init_off(url)
            except Exception:
                return res
        article = text.body.find('div', attrs={'property': 'articleBody'})
        if article:
            article = article.find_all("p")
            for p in article:
                res += p.get_text()
                res += ' '
        return res

    def parse_fox(self, url):
        res = ''
        try:
            text = self.parse_init_off(url)
        except Exception:
            try:
                text = self.parse_init(url)
            except Exception:
                return res
        article = text.body.find('div', attrs={'itemprop': 'articleBody'})
        if article:
            article = article.find_all("p")
            for p in article:
                res += p.get_text()
                res += ' '
        return res

    def parse_nhk(self, url):
        res = ''
        try:
            text = self.parse_init(url)
        except Exception:
            try:
                text = self.parse_init_off(url)
            except Exception:
                return res
        article = text.body.find('div', attrs={'class': 'content'})
        if article:
            article = article.find_all("p")
            for p in article:
                res += p.get_text()
                res += ' '
        return res

    def parse_cndaily(self, url):
        res = ''
        try:
            text = self.parse_init_off(url)
        except Exception:
            try:
                text = self.parse_init(url)
            except Exception:
                return res
        article = text.find('div', attrs={'id': 'Content'})
        if article:
            article = article.find_all("p")
            for p in article:
                res += p.get_text()
                res += ' '
        return res

    def parse_alj(self, url):
        res = ''
        try:
            text = self.parse_init_off(url)
        except Exception:
            try:
                text = self.parse_init(url)
            except Exception:
                return res
        article = text.body.find('div', attrs={'id': 'article-body'})
        if article:
            article = article.find_all("p")
            for p in article:
                res += p.get_text()
                res += ' '
        return res

    def parse_moscowt(self, url):
        res = ''
        try:
            text = self.parse_init_off(url)
        except Exception:
            try:
                text = self.parse_init(url)
            except Exception:
                return res
        article = text.body.find('div', attrs={'class': 'article_text'})
        if article:
            article = article.find_all("p")
            for p in article:
                res += p.get_text()
                res += ' '
        return res

    def parse_guardian(self, url):
        res = ''
        try:
            text = self.parse_init_off(url)
        except Exception:
            try:
                text = self.parse_init(url)
            except Exception:
                return res
        article = text.body.find('div', attrs={'itemprop': 'articleBody'})
        if article:
            article = article.find_all("p")
            for p in article:
                res += p.get_text()
                res += ' '
        return res