def handle_filters(filter_list):
    """Build a dict of Filter instances keyed by their position in filter_list."""
    filter_dict = {}
    for idx, filter in enumerate(filter_list):
        filter_ins = Filter(f_id=filter['id'],
                            f_name=filter['name'],
                            f_type=filter['type'],
                            f_update=filter['updated'])
        # Attach the type-specific details payload.
        if filter['type'] == "SEARCH_AND_REPLACE":
            filter_ins.details = filter['searchAndReplaceDetails']
        elif filter['type'] == "INCLUDE":
            filter_ins.details = filter["includeDetails"]
        elif filter['type'] == "EXCLUDE":
            filter_ins.details = filter["excludeDetails"]
        elif filter['type'] == "LOWERCASE":
            filter_ins.details = filter["lowercaseDetails"]
        elif filter['type'] == "UPPERCASE":
            filter_ins.details = filter["uppercaseDetails"]
        elif filter['type'] == "ADVANCED":
            filter_ins.details = filter["advancedDetails"]
        else:
            filter_ins.details = {'key': 'value'}
        filter_dict["filter_{}".format(idx)] = filter_ins
    return filter_dict
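# A minimal usage sketch for handle_filters (assumptions: Filter comes from the
# surrounding module and filter_list entries follow the field layout implied
# above; the sample entry below is hypothetical).
sample_filters = [
    {'id': '1', 'name': 'lowercase hostname', 'type': 'LOWERCASE',
     'updated': '2015-01-01T00:00:00Z',
     'lowercaseDetails': {'field': 'GEO_DOMAIN'}},
]
filters_by_key = handle_filters(sample_filters)
print(filters_by_key['filter_0'].details)  # -> {'field': 'GEO_DOMAIN'}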
def build_one_dataset(self, curr_data):
    # Unpack the data-related info; num_examples is not used
    curr_data_path, _, extra_tensors = curr_data

    # Dictionary with keys being source, and values being directories
    self.source_paths = {
        source: os.path.join(curr_data_path, source)
        for source in self.sources}

    # Load filters and add their sources to source_paths
    if self.filter_rule:
        self.filter = Filter(self.filter_rule)
        for f in self.filter.keys:
            self.source_paths[f] = os.path.join(curr_data_path, f)
            if f not in self.all_sources:
                self.all_sources.append(f)
    else:
        self.filter = None

    # Load metas
    self.meta_dict = self.parse_standard_tfmeta(self.source_paths)

    # Get tfr filenames
    source_lists = {
        source: self.get_tfr_filenames(
            self.source_paths[source], file_pattern=self.file_pattern)
        for source in self.source_paths}

    # This shuffle needs to be False to keep the order of every attribute
    # the same
    file_datasets = {
        source: tf.data.Dataset.list_files(curr_files, shuffle=False)
        for source, curr_files in source_lists.items()}
    if self.is_training:
        # Shuffle file names using the same shuffle_seed
        file_datasets = {
            source: curr_dataset.shuffle(
                buffer_size=len(list(source_lists.values())[0]),
                seed=self.shuffle_seed).repeat()
            for source, curr_dataset in file_datasets.items()}

    # Create dataset for both
    def _fetch_dataset(filename):
        buffer_size = 8 * 1024 * 1024  # 8 MiB per file
        dataset = tf.data.TFRecordDataset(filename, buffer_size=buffer_size)
        return dataset

    each_dataset = {
        source: curr_dataset.apply(
            tf.contrib.data.parallel_interleave(
                _fetch_dataset, cycle_length=1, sloppy=False))
        for source, curr_dataset in file_datasets.items()}

    # Decode raw first before zip
    each_dataset = {
        source: curr_dataset.map(
            lambda x: self.postproc_each(x, source),
            num_parallel_calls=self.map_pcall_num)
        for source, curr_dataset in each_dataset.items()}

    # Zip, repeat, batch
    zip_dataset = tf.data.Dataset.zip(each_dataset)
    zip_dataset = zip_dataset.repeat()
    zip_dataset = zip_dataset.batch(self.enqueue_batch_size)

    # Set shape (first dimension to be batch size)
    zip_dataset = zip_dataset.map(
        lambda x: {
            key: self.set_data_shape(value)
            for key, value in x.items()},
        num_parallel_calls=self.map_pcall_num)

    # Create sequence for each dataset
    zip_dataset = zip_dataset.map(
        lambda x: {
            key: self.create_data_sequence(value)
            for key, value in x.items()},
        num_parallel_calls=self.map_pcall_num)

    # Add extra tensors, tiled to the (batch, time) leading dimensions
    def add_extra_tensors(value):
        for extra_key, extra_tensor in extra_tensors.items():
            assert extra_key not in value
            batch_size = value[list(value.keys())[0]].get_shape().as_list()[0]
            time = value[list(value.keys())[0]].get_shape().as_list()[1]
            extra_tensor = tf.constant(extra_tensor, dtype=tf.float32)
            extra_shape = extra_tensor.get_shape().as_list()
            value[extra_key] = tf.tile(
                tf.reshape(extra_tensor, [1, 1] + extra_shape),
                [batch_size, time] + [1] * len(extra_shape))
            if extra_key not in self.all_sources:
                self.all_sources.append(extra_key)
        return value

    zip_dataset = zip_dataset.map(
        add_extra_tensors,
        num_parallel_calls=self.map_pcall_num)
    return zip_dataset
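# A minimal consumption sketch (assumptions: TensorFlow 1.x graph mode, `loader`
# is an instance of the surrounding data-provider class, and `curr_data` is the
# (path, num_examples, extra_tensors) tuple unpacked above).
zip_dataset = loader.build_one_dataset(curr_data)
next_batch = zip_dataset.make_one_shot_iterator().get_next()
with tf.Session() as sess:
    batch = sess.run(next_batch)  # dict mapping each source name to a numpy array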
from flask import Flask, request, send_file
from utils import Filter, formatResponse, formatPredictionInput
import pickle
import sys
import pdb

filter = Filter('pythonsqlite.db')
app = Flask(__name__)
sat_model = pickle.load(open('primary_sat_model.sav', 'rb'))


@app.route("/scores")
@formatResponse
def send_scores():
    # Dispatch on whichever query parameters are present.
    if all(arg in request.args for arg in ['score', 'conditional', 'subject']):
        return filter.byScore(request.args)
    elif 'subject' in request.args:
        return filter.bySubject(request.args['subject'])
    elif 'school' in request.args:
        return filter.bySchool(request.args['school'])


@app.route("/matrix")
@formatResponse
def send_matrix():
    res = send_file('matrix.svg', mimetype="image/svg+xml")
    return res


@app.route("/predict")
@formatResponse
def predict():
    input = formatPredictionInput(request.args)
    # Debugger breakpoint left in place; the prediction logic that would use
    # sat_model and `input` is not part of this excerpt.
    pdb.set_trace()
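# A minimal local-usage sketch (assumptions: the module above is saved as app.py
# and the Flask development server on the default 127.0.0.1:5000 is acceptable):
#
#   $ FLASK_APP=app.py flask run
#
import requests

resp = requests.get("http://127.0.0.1:5000/scores", params={"subject": "math"})
print(resp.status_code, resp.text)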
from datetime import date
from random import choice

import requests
from abpy import Filter  # assumption: the ad-block Filter(file)/match() helper used below


class Page(object):
    def __init__(self, url, query):
        self.url = None
        self.raw_html = None
        self.crawl_date = None
        self.status = None
        self.error_type = None
        self.status_code = None
        self.type = "page"
        self.url = url
        self.query = query
        self.crawl_date = self.start_date = date.today()
        self.unwanted_extensions = [
            'css', 'js', 'gif', 'asp', 'GIF', 'jpeg', 'JPEG', 'jpg', 'JPG',
            'pdf', 'PDF', 'ico', 'ICO', 'png', 'PNG', 'dtd', 'DTD',
            'mp4', 'mp3', 'mov', 'zip', 'bz2', 'gz',
        ]
        self.adblock = Filter(open('easylist.txt'))
        self.create()

    def create(self):
        if self.check() and self.request() and self.control():
            return Article()
        else:
            return self.bad_status()

    def check(self):
        '''Bool: check the format of the next url compared to curr url'''
        if self.url is None or len(self.url) <= 1 or self.url == "\n":
            self.error_type = "Url is empty"
            self.status_code = 204
            self.status = False
            return False
        elif ((self.url.split('.')[-1] in self.unwanted_extensions)
              and (len(self.adblock.match(self.url)) > 0)):
            self.error_type = "Url does not have a proper extension or page is an advertisement"
            self.status_code = 204
            self.status = False
            return False
        else:
            self.status = True
            return True

    def request(self):
        '''Bool: request a webpage, return boolean and update src'''
        try:
            requests.adapters.DEFAULT_RETRIES = 2
            user_agents = [
                u'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1',
                u'Mozilla/5.0 (Windows NT 6.1; rv:15.0) Gecko/20120716 Firefox/15.0a2',
                u'Mozilla/5.0 (compatible; MSIE 10.6; Windows NT 6.1; Trident/5.0; InfoPath.2; SLCC1; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET CLR 2.0.50727) 3gpp-gba UNTRUSTED/1.0',
                u'Opera/9.80 (Windows NT 6.1; U; es-ES) Presto/2.9.181 Version/12.00',
            ]
            headers = {'User-Agent': choice(user_agents)}
            # NOTE: the original dict repeats the "https" key, so only the second
            # proxy survives; the call below passes proxies=None anyway.
            proxies = {
                "https": "77.120.126.35:3128",
                "https": '88.165.134.24:3128',
            }
            try:
                self.req = requests.get(self.url, headers=headers,
                                        allow_redirects=True, proxies=None,
                                        timeout=5)
                try:
                    self.raw_html = self.req.text
                    self.status = True
                    return True
                except Exception as e:
                    self.error_type = "Request answer was not understood %s" % e
                    self.status_code = 400
                    self.status = False
                    return False
                # NOTE: with the early returns above, this else branch is
                # unreachable as written in the original.
                else:
                    self.error_type = "Not relevant"
                    self.status_code = 0
                    self.status = True
                    return False
            # The excerpt is truncated from here on; generic handlers following
            # the same error-reporting pattern are assumed so the method parses.
            except Exception as e:
                self.error_type = "Request failed %s" % e
                self.status_code = 400
                self.status = False
                return False
        except Exception as e:
            self.error_type = "Request setup failed %s" % e
            self.status_code = 400
            self.status = False
            return False
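# A minimal usage sketch (assumptions: easylist.txt sits in the working directory
# and the rest of the class -- control(), bad_status() -- plus Article are
# available from the original project; the URL is only illustrative).
page = Page("https://example.com/some-article", query="example")
print(page.status, page.status_code, page.error_type)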
import pandas as pd  # needed for read_csv / concat below
import matplotlib.pyplot as plt
from utils import Filter
import warnings
warnings.filterwarnings("ignore")

# ## Impact of commits on performance

file_path = '../data/dataset_round1.csv'
df = pd.read_csv(file_path)
refactorings = df.columns[5:-5]

# Consider only datapoints with a single refactoring type
f = Filter(df, refactorings)
df = df[f.singlereftype()]

# Include the additional dataset (second round of data collection)
df_singleref = pd.read_csv('../data/dataset_round2.csv')
df = pd.concat([df_singleref, df])
refactorings = df.columns[5:-5]

# Performance relative-change threshold
change_threshold = 0.01 / (10**20)


def which_refactoring(row):
    num_ref = max(row[r] for r in refactorings)
    assert (num_ref > 0)
    assert (sum(row[r] for r in refactorings) == num_ref)
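# A hedged completion sketch: the excerpt ends after the sanity checks above, so
# the return statement below is an assumption -- with a single refactoring type
# per row (enforced by Filter.singlereftype), the function presumably names the
# one refactoring column that carries the non-zero count.
def which_refactoring(row):
    num_ref = max(row[r] for r in refactorings)
    assert num_ref > 0
    assert sum(row[r] for r in refactorings) == num_ref
    return next(r for r in refactorings if row[r] == num_ref)

# Hypothetical usage: label every datapoint with its refactoring type.
df['refactoring'] = df.apply(which_refactoring, axis=1)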