def aggregate_preprocess_results(codes, dict_edits, dict_newcomers, dict_reverts):
    """Aggregate edit, newcomer and revert statistics for every code.

    For each code the edits returned by ``process_edits`` are renamed,
    grouped by ``date``/``covid``/``user_kind`` and summed, then passed
    through ``process_newcomers`` and ``process_reverts``.  A failure while
    processing one code is logged and that code is skipped.

    Args:
        codes: iterable of codes to process.
        dict_edits: forwarded to ``process_edits``.
        dict_newcomers: forwarded to ``process_newcomers``.
        dict_reverts: forwarded to ``process_reverts``.

    Returns:
        A DataFrame concatenating the per-code results (with a ``code``
        column and without the ``index`` column).  An empty DataFrame is
        returned when no code produced a result.
    """
    aggs = []
    for code in codes:
        start = time.time()
        try:
            df_gb = process_edits(dict_edits, code)
            # group edits
            df_gb.rename({"title": "index",
                          'event_user_id': 'count',
                          'revision_text_bytes_diff': 'rev_len_sum'},
                         inplace=True, axis=1)
            final = df_gb.groupby(["date", "covid", "user_kind"]).sum().reset_index()
            final = process_newcomers(dict_newcomers, code, final)
            final = process_reverts(dict_reverts, code, final)
            final = final.fillna(0)
            final["code"] = code
            aggs.append(final.loc[:, final.columns != 'index'])
        except Exception as e:
            # Best effort: one broken code must not abort the whole run.
            traceback.print_exc()
            Logger.instance('pipeline').info(f'Error for {code}: {str(e)}')
        Logger.instance('pipeline').info(f'Processing {code} took {time.time() - start}')

    if not aggs:
        # pd.concat raises ValueError on an empty list.
        return pd.DataFrame()
    return pd.concat(aggs)
def worker(self, ip):
    """Point the A record of every configured host at ``ip``.

    Host names come from the comma-separated ``self.config['hosts']``;
    ``'@'`` stands for the zone itself, any other entry is prefixed to the
    zone.  Names are normalised to end with a dot before being looked up in
    the mapping returned by ``self._get_records``.  Unknown hosts and failed
    updates are logged and skipped.

    :param ip: address written to the ``content`` field of each record
    """
    hosts = self.config['hosts'].split(',')
    host_records = self._get_records(self.zones[self.zone])
    put_url = f"zones/{self.zones[self.zone]}/records"
    for host in hosts:
        if host == '@':
            host = self.zone
        else:
            host += '.' + self.zone
        if not host.endswith('.'):
            host += '.'
        if host not in host_records:
            Logger.warning(f"Attempted to update host '{host}' "
                           "that is not found under this account!")
            continue
        data = {
            "type": "A",
            "name": host,
            "content": ip,
            "ttl": self.ttl,
        }
        ret = self.rest.put(f"{put_url}/{host_records[host]}", data)
        if not ret:
            # Both literals need the f prefix, otherwise the zone
            # placeholder is logged verbatim.
            Logger.error(f"Unable to update host record for '{host}' at "
                         f"zone '{self.zone}'")
def run(cls, file, *args, **kwargs):
    """
    Execute the command built for ``file`` and return its filtered output.

    The command comes from ``cls.make_cmd`` and the filter pattern from
    ``cls.make_regex``; stdout lines (line endings kept) are passed through
    ``cls._filter``.  Anything written to stderr is logged as a warning.

    Note: This is not profilable so subclasses should override it
    to make sure it's taken into account by --profile
    """
    pattern = cls.make_regex(file)
    command = cls.make_cmd(file, *args, **kwargs)

    if command is None:
        # Empty command: nothing to run, hand back an empty iterator.
        return iter(())

    Logger.debug("Running command {}".format(" ".join(command)))
    completed = subprocess.run(
        command,
        shell=False,
        close_fds=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )

    error_output = completed.stderr
    if error_output:
        Logger.warn("Error while running command \"{}\": {}".format(
            " ".join(command), error_output.decode("utf-8")))

    output_lines = completed.stdout.splitlines(True)
    return cls._filter(output_lines, pattern)
def _compare(config, delay_output, pair):
    """
    Compare the two files in ``pair``.

    Returns None when the files are equal or when their computed distance
    is below ``config.min_dist``; otherwise returns the tuple
    (file1, file2, distance).  The distance is None when
    ``config.compute_distance`` is false.
    """
    left, right = pair
    Logger.progress("Comparing {} and {}...".format(left.relative_path,
                                                    right.relative_path))

    if FileComparator.are_equal(left, right):
        return None

    distance = None
    if config.compute_distance:
        distance = FilesetComparator.compute_distance(left, right)

    too_similar = distance is not None and distance < config.min_dist
    if too_similar:
        # Ignore files that are too similar
        return None

    edit = (left, right, distance)
    if not delay_output:
        # Print right away so the user doesn't have to wait too long.
        output_change(edit, config)
    return edit
def __init__(self, opt):
    """Build the logger, model and datasets described by ``opt``.

    When ``opt.model_name`` contains ``'bert'`` a BERT tokenizer and
    pretrained model are used; otherwise a tokenizer and embedding matrix
    are built from the train/test files.  A validation set is split off the
    training set when ``opt.valset_ratio`` is positive, else the test set
    doubles as validation set.
    """
    self.opt = opt
    self.logger = Logger(opt.model_name, opt.dataset)

    if 'bert' not in opt.model_name:
        tokenizer = build_tokenizer(
            fnames=[opt.dataset_file['train'], opt.dataset_file['test']],
            max_seq_len=opt.max_seq_len,
            dat_fname='{0}_tokenizer.dat'.format(opt.dataset))
        model_input = build_embedding_matrix(
            word2idx=tokenizer.word2idx,
            embed_dim=opt.embed_dim,
            dat_fname='{0}_{1}_embedding_matrix.dat'.format(str(opt.embed_dim), opt.dataset))
    else:
        tokenizer = Tokenizer4Bert(opt.max_seq_len, opt.pretrained_bert_name)
        model_input = BertModel.from_pretrained(opt.pretrained_bert_name)
    # Either branch feeds its first artefact to the model class.
    self.model = opt.model_class(model_input, opt).to(opt.device)

    self.trainset = ABSADataset(opt.dataset_file['train'], tokenizer)
    self.testset = ABSADataset(opt.dataset_file['test'], tokenizer)

    assert 0 <= opt.valset_ratio < 1
    if opt.valset_ratio > 0:
        train_total = len(self.trainset)
        valset_len = int(train_total * opt.valset_ratio)
        split_sizes = (train_total - valset_len, valset_len)
        self.trainset, self.valset = random_split(self.trainset, split_sizes)
    else:
        self.valset = self.testset

    if opt.device.type == 'cuda':
        allocated = torch.cuda.memory_allocated(device=opt.device.index)
        self.logger.log('cuda memory allocated: {}'.format(allocated))
    self._print_args()
def serverlistener(in_q):
    """Consume project names from ``in_q`` and start one process per item.

    The sentinel string ``"KILLSERVERCOMMAND"`` flags the module-level
    ``t1`` and ``download_thread`` objects as not alive and exits.  For any
    other item a marker is appended to ``.temp_thread_file`` and ``program``
    is started in a new ``Process`` with a prepared argument namespace.
    """
    while True:
        # Blocks until some data is available.
        data = in_q.get()

        if data == "KILLSERVERCOMMAND":
            t1.isAlive = False
            download_thread.isAlive = False
            Logger("Server reports killed", Logger.INFO)
            Logger("Exiting program! Bye. ", Logger.INFO)
            exit(0)

        args = argparse.Namespace(
            project=[data],
            disable_server=False,
            log_warnings=False,
            log_errors=False,
            disable_browser=True,
        )

        # On windows: write arguments to file, spawn process, read arguments
        # from file, delete.  Elsewhere only a placeholder is written.
        with open('.temp_thread_file', 'a') as the_file:
            if os.name == 'nt':
                the_file.write(data+"\n")
                the_file.write("False\n")  # disable_server
                the_file.write("False\n")  # log_warnings
                the_file.write("True\n")
            else:
                the_file.write("filling")

        child = Process(target=program, args=(args,))
        child.start()
def start_workflow(shared_state, start_date, review_number=0):
    """Run a ``Workflow`` and mark its job as failed if it raises.

    ``shared_state.job_id`` and ``shared_state.completed`` are reset before
    the workflow starts.  Limits are read from the ``MAX_DOWNLOADS``,
    ``MAX_UPLOADERS`` (default 20) and ``ALLOW_REPEAT`` environment
    variables.  ``review_number`` is accepted but not used here.
    """
    db_connection = setup_db().connect()
    logger = Logger(db_connection)

    shared_state.job_id = None
    shared_state.completed = False

    raw_max_downloads = environ.get('MAX_DOWNLOADS')
    max_downloads = None if raw_max_downloads is None else int(raw_max_downloads)
    max_upload_workers = int(environ.get('MAX_UPLOADERS', 20))
    allow_repeat = environ.get('ALLOW_REPEAT', 'FALSE') == 'TRUE'

    try:
        Workflow(
            db_connection,
            logger,
            start_date,
            max_downloads,
            max_upload_workers,
            allow_repeat,
        ).start(shared_state)
    except Exception:
        logger.exception()
        if shared_state.job_id is None:
            # No job was registered, nothing to flag.
            return
        # NOTE(review): `job` is not defined inside this function;
        # presumably a module-level name - confirm.
        job_serializer = Serializer(db_connection, job)
        job_serializer.put(shared_state.job_id, {
            'status': JobStatus.FAILED,
        })
def importExclList(self, filename):
    """Load exclusion entries from ``filename`` via ``self.addExclItem``.

    Each line has the form ``searchword|||"dir","subdir"[|||comment]``.
    The quoted, comma-separated directory parts are joined with
    ``os.path.join``; a missing comment becomes ``""``.

    Returns an empty list when the file cannot be opened.  A line without
    the directory field is a format error: it is logged and the program
    exits.
    """
    try:
        with open(filename, "r") as file:
            lines_in_file = file.read().splitlines()
    except IOError:
        Logger("could not open file '%s'." % filename, Logger.ERROR)
        return list()

    try:
        for line in lines_in_file:
            fields = line.split('|||')
            searchword = fields[0]
            comment = fields[2] if len(fields) > 2 else ""
            dir_parts = [item.strip("\"") for item in fields[1].split(',')]
            self.addExclItem(searchword, comment, os.path.join(*dir_parts))
    except (IOError, IndexError):
        # IndexError is what a malformed line actually raises; without it
        # the "format is not readable" report could never be reached.
        Logger("Format is not readable or file is missing: %s." % filename, Logger.ERROR)
        sys.exit()
def importList(self, filename):
    """Load search words from ``filename`` via ``self.addSearchItem``.

    Each line has the form ``searchword|||importance[|||comment]``.  An
    empty importance field defaults to 20 and a missing comment to ``""``.
    Entries are flagged as OWASP when the file name contains
    ``owasp_static_android.txt``.

    Returns an empty list when the file cannot be opened.  A line without
    the importance field, or with a non-numeric importance, is a format
    error: it is logged and the program exits.
    """
    try:
        with open(filename, "r") as file:
            lines_in_file = file.read().splitlines()
    except IOError:
        Logger("could not open file '%s'." % filename, Logger.ERROR)
        return list()

    owasp = "owasp_static_android.txt" in filename
    try:
        for line in lines_in_file:
            fields = line.split('|||')
            searchword = fields[0]
            # An empty importance falls back to the default instead of
            # skipping the line or reusing a previous search word.
            importance = int(fields[1]) if fields[1] else 20
            comment = fields[2] if len(fields) > 2 else ""
            self.addSearchItem(searchword, importance, comment, owasp)
    except (IOError, IndexError, ValueError):
        # IndexError / ValueError are what malformed lines actually raise.
        Logger("Format is not readable or file is missing: %s." % filename, Logger.ERROR)
        sys.exit()
def get_q_string(self, query_string):
    """
    GET ``self.base_url`` with the given query string appended.

    :param query_string: query string in dict, where key is LHS, and
                         value is RHS for each query string pair; keys and
                         values are passed to ``quote`` and must be strings
    :returns: Str response. If error, returns None
    """
    # The URL layout (leading '?', trailing '&') is kept as it was.
    q_string = '?' + ''.join(
        f"{quote(k)}={quote(v)}&" for k, v in query_string.items())
    try:
        res = requests.get(f"{self.base_url}{q_string}")
    except requests.exceptions.RequestException as e:
        Logger.error(f"Request failed: {e}")
        return None

    # Return the response text only when the server answers with HTTP 200.
    if res.status_code == 200:
        return res.text
    Logger.error(f"Status code is {res.status_code}. "
                 f"Response: {res.text}")
    return None
def getHistoricalIntradayByMinute(ticker, day=None):
    """Return the result of ``get_historical_intraday(ticker, day)``.

    Any exception raised by the query is logged, including its cause, and
    an empty dict is returned instead.
    """
    try:
        return get_historical_intraday(ticker, day)
    except Exception as ex:
        # Best effort: callers receive an empty result rather than an error.
        Logger.error(
            'Failed querying IEX historical intraday data for {}: {}'.format(
                ticker, ex))
        return {}
def get(self, endpoint):
    """
    GET ``{self.base_url}/{endpoint}`` using ``self.auth`` and
    ``self.headers``.

    :param endpoint: REST API endpoint to GET
    :returns: Str response. If error, returns None
    """
    try:
        res = requests.get(f"{self.base_url}/{endpoint}",
                           auth=self.auth,
                           headers=self.headers)
    except requests.exceptions.RequestException as e:
        Logger.error(f"Request failed: {e}")
        return None

    # Return the response text only when the server answers with HTTP 200.
    if res.status_code == 200:
        return res.text
    # Both literals carry the f prefix so the status code is interpolated.
    Logger.error(f"ERROR: Status code is {res.status_code}. "
                 f"Response: {res.text}")
    return None
def do_POST(self):
    """Serve a POST request.

    A request line containing ``KILLSERVERCOMMAND`` forwards the kill
    command to the drag-and-drop server queue and exits.  Any other request
    is handed to ``self.deal_post_data`` and answered with a small HTML
    page that reports the outcome and links back to the referring page.
    """
    import html  # local import: only needed to escape the Referer header

    if re.findall(r'KILLSERVERCOMMAND', self.requestline):
        ServerWrapper.dragdropserver.q.put("KILLSERVERCOMMAND")
        Logger("Server upload killed", Logger.INFO)
        self.send_response(200)
        exit(0)
        # Not reached when exit() raises SystemExit, as the builtin does.
        return True, "Exit"

    r, info = self.deal_post_data()
    Logger((str(r) + str(info) + "by: " + str(self.client_address)), Logger.INFO)

    # The Referer header is client-controlled and may be absent: fall back
    # to the site root and escape it before putting it into the markup.
    referer = html.escape(self.headers['referer'] or "/", quote=True)
    outcome = b"<strong>Success:</strong>" if r else b"<strong>Failed:</strong>"
    page = b"".join([
        b'<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 3.2 Final//EN">',
        b"<html>\n<title>Upload Result Page</title>\n",
        b"<body>\n<h2>Upload Result Page</h2>\n",
        b"<hr>\n",
        outcome,
        info.encode(),
        ("<br><a href=\"%s\">back</a>" % referer).encode(),
    ])

    self.send_response(200)
    self.send_header("Content-type", "text/html")
    self.send_header("Content-Length", str(len(page)))
    self.end_headers()
    with BytesIO(page) as f:
        self.copyfile(f, self.wfile)
def __init__(self, botId):
    """Create the logger and managers, then look up the bot and arbitrage
    registered under ``botId``."""
    self.logger = Logger("arbitrage")
    self.logger.info("#" + str(botId) + ": Initializing.")

    # Managers are created first; the lookups below go through them.
    self.bot_manager = BotManager()
    self.exchange_manager = ExchangeManager()
    self.arbitrage_manager = ArbitrageManager()

    bot_manager = self.bot_manager
    arbitrage_manager = self.arbitrage_manager
    self.bot = bot_manager.get_bot(botId)
    self.arbitrage = arbitrage_manager.get_arbitrage(botId)
def getPublic():
    """Return the public IP address reported by https://ipinfo.io.

    Returns the ``ip`` field of the JSON answer when the service replies
    with HTTP 200, and None for any other status code.  When the request
    itself fails the error is logged and the program exits with status 1.
    """
    try:
        response = requests.get("https://ipinfo.io")
        if response.status_code == 200:
            return response.json()['ip']
    except requests.exceptions.RequestException as e:
        Logger.error("Unable to get public IP address!")
        Logger.error(e)
        # Was written as the tuple expression `sys, exit(1)`.
        sys.exit(1)
def persist_image(target_folder: str, url: str, img_name: str, logger: Logger):
    """Download the image at ``url`` and store it as a PNG.

    The downloaded bytes are opened as an image, converted to RGB and
    written to ``<target_folder>/<img_name>.png``.  Every failure is
    reported through ``logger.infoDanger`` and otherwise ignored, so one
    bad URL does not interrupt the caller.
    """
    try:
        image_content = requests.get(url).content
        image = Image.open(BytesIO(image_content)).convert('RGB')
        file_path = path.join(target_folder, f'{img_name}.png')
        with open(file_path, 'wb') as f:
            image.save(f, 'PNG', quality=100)
        logger.infoSuccess(f'SUCCESS - saved {url} - as {file_path}')
    except Exception as e:
        # Deliberately broad: downloading is best effort.
        logger.infoDanger(f'ERROR - Could not download {url} - {e}')
def volumeOverYearLineGraph(ticker, year):
    """Plot the daily ``volume`` values of ``ticker`` for ``year`` as a
    line graph and hand the figure to ``Logger.writeGraphToFile`` under the
    name ``<ticker>_<year>_Volume``."""
    rows = historical_daily.getByTickerAndYear(ticker, year)
    # One entry per date; a later row for the same date replaces an earlier one.
    volume_by_day = {
        pd.to_datetime(row.date): row.data.get('volume') for row in rows
    }

    axes = pd.Series(volume_by_day).plot.line()
    axes.set_xlabel("Day")
    axes.set_ylabel("Volume")

    graph_name = "_".join([ticker, str(year), "Volume"])
    Logger.writeGraphToFile(axes.get_figure(), graph_name)
def find_matches_in_src_file(self, CODE_OFFSET, QUERY_IMPORTANCE):
    """Scan ``self.file_path`` line by line for the ``SRC_WORDS`` search words.

    Only search words whose importance is greater than ``QUERY_IMPORTANCE``
    are used.  A hit is dropped when an ``EXCL_WORDS`` entry also matches
    the line and its ``dir`` is empty/None or contained in the file path.
    Every remaining hit is stored, together with up to ``CODE_OFFSET``
    surrounding lines, in ``self.all_matches`` and ``self.src_matches``.

    Files whose escaped path is longer than 255 characters are skipped with
    a warning.  Returns an empty list when the file cannot be opened,
    otherwise None.
    """
    try:
        if len(self.file_path.encode('unicode_escape').decode()) > 255:
            Logger(
                "Filepath is too big. Try moving the StaCoAn folder to the root of your drive, make the APK name shorter and try again. The following file will be ignored to let StaCoAn continue: '%s'" % self.file_path,
                Logger.WARNING)
        else:
            with open(self.file_path, "r", encoding="utf8", errors='ignore') as file:
                lines_in_file = file.read().splitlines()

            search_items = SearchLists.all_lists["SRC_WORDS"].ListCollection
            excl_items = SearchLists.all_lists["EXCL_WORDS"].ListCollection
            for line_index, line in enumerate(lines_in_file, start=1):
                for listItem in search_items:
                    if int(listItem.importance) <= QUERY_IMPORTANCE:
                        continue
                    if not re.search(listItem.searchword, line, re.IGNORECASE):
                        continue
                    # The emptiness test comes first: `None in str` raises.
                    exclude = any(
                        re.search(ExclItem.searchword, line, re.IGNORECASE)
                        and (not ExclItem.dir or ExclItem.dir in self.file_path)
                        for ExclItem in excl_items)
                    if exclude:
                        continue
                    upper_range = min(line_index + CODE_OFFSET, len(lines_in_file) + 1)
                    lower_range = max(line_index - CODE_OFFSET - 1, 1)
                    src_match = MatchSource(
                        listItem.searchword, line_index,
                        lines_in_file[lower_range:upper_range],
                        listItem.importance, len(lines_in_file),
                        listItem.owasp, listItem.comment)
                    self.all_matches.append(src_match)
                    self.src_matches.append(src_match)
            self.orden_matches()
    except IOError as e:
        # Two values need two placeholders; with a single %s this handler
        # itself raised TypeError.
        Logger(
            "could not open file '%s'. Error: %s" % (self.file_path, e.strerror),
            Logger.WARNING)
        return list()
def worker(self, ip):
    '''
    Make the post request for each record that we have in records list.

    Every entry of ``self.records`` gets its ``ipv4Address`` set to ``ip``
    and is posted to ``/dns/<id>``.  The outcome of each post is logged.
    '''
    for host in self.records:
        host['ipv4Address'] = ip
        response = self.rest.post('/dns/' + str(host['id']), host)
        # Kept as an equality test with True: a merely truthy answer
        # still counts as a failure, as before.
        if response == True:  # noqa: E712
            Logger.info(f"Updating IP address {ip} to Dynu.net record {host['name']}")
        else:
            # Spelled `error`, matching the other Logger calls.
            Logger.error(f"Failed to update IP address {ip} to Dynu.net record {host['name']}")
def get_json(self, endpoint):
    """
    GET ``endpoint`` through ``self.get`` and decode the answer as JSON.

    :param endpoint: REST API endpoint to GET
    :returns: dict representing json response. If error, returns None
    """
    resp = self.get(endpoint)
    if resp is None:
        # The request itself failed; json.loads(None) would raise TypeError.
        return None
    try:
        return json.loads(resp)
    except json.JSONDecodeError:
        Logger.error(f"Invalid response to request at endpoint {endpoint}")
        return None
def is_increasing_consecutively(symbol, iexHistoricalData, repeats):
    """Tell whether the most recent minute averages rose ``repeats`` times
    in a row.

    The data is walked from the last entry backwards; entries without an
    ``average`` are skipped.  Each entry whose average is lower than the
    previously visited one extends the chain, anything else resets it and
    counts as a failed chain.  Returns True once the chain reaches
    ``repeats``, False after three failed chains or when the data runs out.
    """
    Logger.debug('analyzing {} for {} repeated increasing minutes'.format(
        symbol, repeats))

    streak = 0
    broken_streaks = 0
    newer_average = None

    for minute in reversed(iexHistoricalData):
        average = minute.get('average')
        if average is None:
            # many trading minutes have no activity
            continue

        if newer_average is None:
            # First usable minute: nothing to compare against yet.
            pass
        elif average < newer_average:
            Logger.debug('increasing consecutive count')
            streak += 1
        elif average >= newer_average:
            streak = 0
            broken_streaks += 1

        if streak >= repeats:
            Logger.debug(
                'successful increasing successfully check on {} for {} repeats'
                .format(symbol, repeats))
            return True
        if broken_streaks >= 3:
            Logger.debug(
                'aborting consecutive analysis of {}'.format(symbol))
            return False

        newer_average = average

    return False
def worker(self, ip):
    """Request an update for every configured host.

    Host names come from the comma-separated ``self.config['hosts']``;
    ``'@'`` stands for the zone itself, any other entry is prefixed to the
    zone.  Hosts missing from ``self._zones`` and failed requests are
    logged and skipped.  ``ip`` is not used by this implementation.
    """
    for entry in self.config['hosts'].split(','):
        host = self.zone if entry == '@' else entry + '.' + self.zone

        if host not in self._zones:
            Logger.warning(f"Attempted to update host '{host}' "
                           "that is not found under this account!")
            continue

        updated = HttpProvider.get(self.update_url + host)
        if not updated:
            Logger.error("Unable to update host record "
                         f"for '{host}' at zone '{self.zone}'")
def find_matches_in_db_file(self):
    """Search every table of the SQLite file at ``self.file_path``.

    Each row is converted to a string and checked against the words in
    ``Searchwords.db_search_words``.  A hit is recorded once in
    ``self.db_matches`` and ``self.all_matches`` unless an entry of
    ``Searchwords.exclusion_list`` names the same word with a path fragment
    contained in the file path.  The connection is closed before the
    matches are ordered.
    """
    # Set icon of file
    self.icon = "insert_invitation"
    self.fa_icon = "database"

    db = sqlite3.connect(self.file_path)
    try:
        cursor = db.cursor()
        cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
        for (table_name,) in cursor.fetchall():
            # Quote the identifier so unusual table names stay valid SQL.
            quoted_name = '"%s"' % table_name.replace('"', '""')
            rows = db.execute("SELECT * from %s" % quoted_name).fetchall()
            for line, row in enumerate(rows, start=1):
                row_text = str(row)
                for matchword in Searchwords.db_search_words:
                    if matchword not in row_text:
                        continue
                    excluded = False
                    for item in Searchwords.exclusion_list:
                        if item[0] == matchword and item[1] in self.file_path:
                            Logger("Exclusion found: %s in file %s" % (str(item[0]), self.file_path))
                            excluded = True
                    if excluded:
                        continue
                    importance = Searchwords.db_search_words[matchword]
                    db_match = MatchDatabase(matchword, line, str(table_name),
                                             row_text, importance)
                    self.db_matches.append(db_match)
                    self.all_matches.append(db_match)
    finally:
        db.close()
    self.orden_matches()
def log_request(self, code='-', size='-'):
    """Log the request line with ``code`` and ``size`` at INFO level,
    except for requests whose line mentions one of the static or
    housekeeping resources listed below."""
    silenced = ('lootbox.html', '.ico', 'robots.txt', '.js', '.css',
                'start.html', '.woff2', '.png', '.jpg')
    for marker in silenced:
        if marker in str(self.requestline):
            # Not worth a log entry.
            return
    Logger(self.requestline + " " + str(code) + " " + str(size),
           Logger.INFO)
def log_error(self, format, *args):
    """Log a formatted error as a WARNING, together with the client
    address, a timestamp and the request line.  Requests mentioning
    ``lootbox.html`` or ``robots.txt`` are not logged."""
    for marker in ('lootbox.html', 'robots.txt'):
        if marker in str(self.requestline):
            return
    entry = "%s - - [%s] %s - %s\n" % (
        self.address_string(),
        self.log_date_time_string(),
        format % args,
        str(self.requestline),
    )
    Logger(entry, Logger.WARNING)
def run(self):
    """Queue every ticker and start ten trading threads.

    ``self.budget`` is divided evenly between the threads.  Each thread
    runs ``BetaTrades.quick_trade_one`` with its share of the budget and
    the shared ticker queue.  The threads are started but not joined.
    """
    threadCount = 10
    budgetPerThread = self.budget / threadCount

    stockQueue = Queue()
    for stock in iex.getAllTickers():
        stockQueue.put(stock)

    for _ in range(threadCount):
        threading.Thread(target=BetaTrades.quick_trade_one,
                         args=[budgetPerThread, stockQueue]).start()

    Logger.trace('thread count is ')
    # active_count() replaces the deprecated activeCount() alias.
    Logger.trace(threading.active_count())
def __init__(self, estimator, target, below_loss_margin, over_loss_margin, limit_loss_N):
    """Store the estimator, the target and the loss settings unchanged and
    attach the logger registered under ``conf.LOG_BASE_NAME``."""
    # Plain attribute copies; nothing is validated or converted here.
    self.estimator, self.target = estimator, target
    self.below_loss_margin, self.over_loss_margin = below_loss_margin, over_loss_margin
    self.limit_loss_N = limit_loss_N
    self.logger = Logger.get_instance(conf.LOG_BASE_NAME)
def create_drag_drop_server():
    """Log the configured port and return a ``TCPServer`` bound to it on
    all interfaces, handled by ``ServerWrapper.dragdropserver``."""
    port_text = str(ServerWrapper.DRAG_DROP_SERVER_PORT)
    Logger("serving dragdrop server at port: " + port_text, Logger.INFO)

    # An empty host string binds to every interface.
    bind_address = ("", ServerWrapper.DRAG_DROP_SERVER_PORT)
    return socketserver.TCPServer(
        bind_address, RequestHandlerClass=ServerWrapper.dragdropserver)
def create_reportserver():
    """Log the configured port and return a ``TCPServer`` bound to it on
    all interfaces, handled by ``ServerWrapper.reportserver``."""
    port_text = str(ServerWrapper.REPORT_SERVER_PORT)
    Logger("serving report server at port: " + port_text, Logger.INFO)

    # An empty host string binds to every interface.
    bind_address = ("", ServerWrapper.REPORT_SERVER_PORT)
    return socketserver.TCPServer(
        bind_address, RequestHandlerClass=ServerWrapper.reportserver)
def hook_after_request(response):
    """Log the request and response bodies, then return ``response``.

    ``Logger.info`` is used when the status code is listed in
    ``info_status_list``, ``Logger.error`` otherwise.  A request without a
    readable JSON body is logged as ``'No JSON in request'``.
    """
    route = request.path

    if response.status_code in info_status_list:
        logging_method = Logger.info
    else:
        logging_method = Logger.error

    Logger.debug(request)

    try:
        req_body = request.get_json()
    except Exception:
        req_body = 'No JSON in request'

    raw_response = response.get_data()
    res_body = raw_response.decode('utf-8').rstrip()

    logging_method(
        message='\nRequest: {}\nResponse: {}'.format(req_body, res_body),
        route=route,
        method=request.method,
        res_code=response.status_code,
    )
    return response
sys.path.append("..") sys.path.append(".") from config import * from helpers.basics import load_config from helpers.logger import Logger from helpers.db_helpers import * ## Script specific import sys import logging import collections import datetime if "log" not in globals(): log = Logger.init_logger('STATS_%s'%(cfg.language_code), load_config()) def main(): # mongodb stats can be obtained with cfg.db.command("collstats","dict") db_stats={ 'A/ report_date':datetime.datetime.now(), 'B/ Number of samples':samples_col.count(), 'C/ Number of normalized measures':measurements_col.count(), 'C_a/ Tally of normalized measures':measurements_col.aggregate([{"$group":{"_id":"$type", "count": { "$sum": 1 }}}])['result'], 'D/ Number of species':species_col.count(),
#!/usr/bin/env python
# encoding: utf-8
# Script supposed to be run in the background to populate the DB with
# available datasets: looks up the mappings that have no "src_to_tgt" field.
import sys
sys.path.append("..")
sys.path.append(".")
from config import *
from helpers.basics import load_config
from helpers.logger import Logger
from helpers.db_helpers import *
from helpers.path import data_dir

# The guard tests the name that is actually assigned and used below; testing
# "log" left `logger` unbound whenever a `log` name already existed.
if "logger" not in globals():
    logger = Logger.init_logger('PROCESS_MAPPINGS_%s'%(cfg.language_code), load_config())

logger.info("Running %s",sys.argv[0])

# Get available mappings and process them
mappings_to_process=mappings_col.find({"src_to_tgt":{"$exists":False}})
logger.info("Found %d mappings to process",mappings_to_process.count())
#!/usr/bin/env python
# encoding: utf-8
# Clears the species, publications, samples, mappings and measurements
# collections before the sample data is defined.
import sys
sys.path.append("..")
sys.path.append(".")
from config import *
from helpers.basics import load_config
from helpers.logger import Logger
from helpers.db_helpers import *

# Script
import datetime

if "log" not in globals():
    log = Logger.init_logger('SAMPLE_DATA_%s'%(cfg.language_code), load_config())

# clear db
for collection_to_clear in (species_col,
                            publications_col,
                            samples_col,
                            mappings_col,
                            measurements_col):
    collection_to_clear.remove()

#### Melon
# species
sys.path.append("..") sys.path.append(".") from config import * from helpers.basics import load_config from helpers.logger import Logger from helpers.db_helpers import * from helpers.path import data_dir # Script supposed to be run in the background to populate the DB with available datasets ## Setup from numbers import Number import collections from math import log if "log" not in globals(): logger = Logger.init_logger('DATA_PROCESSOR_%s'%(cfg.language_code), load_config()) logger.info("Running %s",sys.argv[0]) # Get available datasets and insert them in the DB # a_sample=samples_col.find_one({"experimental_results.values":{"$exists":False}}) samples_to_process=samples_col.find({"experimental_results":{"$elemMatch":{"values":{"$exists":False}}}}) logger.info("Found %d samples to process",samples_to_process.count()) for a_sample in samples_to_process: logger.info("Will process dataset for experiment %s",a_sample['name']) parser_config=a_sample['xls_parsing'] for a_result_idx,a_result in [(i,x) for i,x in enumerate(a_sample['experimental_results']) if "values" not in x]: # specialize parser for the result
class Hemnet():
    """Client for searching hemnet.se and caching the results.

    Written for Python 2 (``urllib.quote``, byte strings passed to
    ``hashlib.md5``).  ``parseItems`` and ``createResultItem`` are used by
    the methods below but are not part of this excerpt.
    """

    def __init__(self):
        """Set up the logger, the request helper and the lookup tables."""
        self.log = Logger("Hemnet")
        self.request = Request()

        # Base objects for searches and results
        self.baseUrl = "http://www.hemnet.se"
        self.baseSearch = self.baseUrl + "/sok/create"
        self.baseLocation = self.baseUrl + "/locations/show?"
        self.baseResult = self.baseUrl + "/resultat"
        self.searchQuery = {}

        # Basetype, english -> Swedish
        self.translatedTypes = {
            "municipality" : "Kommun",
            "district" : u"Område",
            "postal_city" : "Stadsdel",
            "region" : u"Län",
            "street" : "Gata",
            "city" : "Stad"
        }

        # BaseAverageTypes -> Swedish
        self.translatedAverageTypes = {
            "age" : u"List ålder",
            "price" : "Medelpris",
            "price_m2" : u"Pris per m²",
            "size" : u"Storlek (m²)",
            "rooms" : "Antal rum",
            "fee" : u"Månadsavgift",
            "price_change_up" : u"Prisökning (%)",
            "price_change_down" : u"Prissäkning (%)"
        }

        # searchTypes
        self.searchTypes = {
            "f" : "fritidshus",
            "v" : "villa",
            "t" : "tomt",
            "r" : "radhus",
            "g" : "gard",
            "b" : "bostadsratt",
            "o" : "other",
            "a" : "all"
        }

        # Items to get average for
        self.itemAverageTypes = {
            "age" : 0,
            "price" : 0,
            "price_m2" : 0,
            "size" : 0,
            "rooms" : 0,
            "fee" : 0,
            "price_change_up" : 0,
            "price_change_down" : 0
        }

        # Base result format
        self.resultFormat = {
            "totalItems" : 0,
            "results" : {}
        }
        self.log.info("Initiated Hemnet")

    def createSearchFormData(self, data, specificType='a'):
        """Build the search form for one location.

        Searchdata is a formpost in a very specific format.  ``data`` is a
        location dict with ``id``, ``name`` and ``parent_location``;
        ``specificType`` is a key of ``self.searchTypes``.
        """
        parent = data.get("parent_location")
        locationData = [{
            "id": data.get("id"),
            "name": data.get("name"),
            "parent_id": parent.get("id"),
            "parent_name": parent.get("name")
        }]

        searchData = {
            "search[location_search]" : locationData,
            "search[location_ids][]": data.get("id"),
            "search[region_id]": -1,
            "search[municipality_ids][]": -1,
            "search[country_id]": 0,
            "search[item_types][]": "%s" % self.searchTypes[specificType],
            "search[price_min]": '',
            "search[price_max]": '',
            "search[fee_max]": '',
            "search[rooms_min]": '',
            "search[living_area_min]": '',
            "search[keywords]": '',
            "commit": ''
        }
        return searchData

    def searchRequest(self, query):
        """Post ``query`` to the search URL and return the request result."""
        return self.request.postRequest(self.baseSearch, query)

    def avgByKey(self, keys, data):
        """Pass a list of keys and a list of dicts to calculate the average
        value for each key.

        Values are summed per key and divided by ``len(data)``, i.e. by the
        number of dicts, not by the number of dicts containing the key.
        """
        final = {}
        for d in data:
            for k in d.keys():
                if k in keys:
                    final[k] = final.get(k, 0) + d[k]
        for k in final.keys():
            final[k] = final[k] / len(data)
        return final

    def getLocationQueryURL(self, query):
        """Return the location lookup URL for the unicode string ``query``."""
        return "%sq=%s" % (self.baseLocation, urllib.quote(query.encode('utf-8')))

    @cache.methodcache.cache('findLocations', expire=72000)
    def findLocations(self, query, extra, area=None):
        """Look up locations matching ``query``.

        Each location gets a ``score`` (Levenshtein ratio between its name
        and ``query``).  When ``area`` is given, only locations whose parent
        location name contains it are kept.  Returns a dict with the search
        forms (``search``), the ``area`` and the ``locations`` sorted by
        descending score.  Results are stored in ``cache.locations``.
        """
        queryURL = self.getLocationQueryURL(query)
        cacheKey = hashlib.md5(queryURL).hexdigest()
        cacheResult = cache.locations.get(cacheKey)
        if cacheResult is not None:
            print("Found cached loc")
            return cacheResult

        locResponse = self.request.getResponse(queryURL, None)
        jdata = json.loads(locResponse)
        print(json.dumps(jdata, indent=4))

        locFormData = []
        locations = []
        for item in jdata:
            location = item.get("location")
            item["score"] = Levenshtein.ratio(location.get("name"), query)
            if area is not None and location.get("parent_location").get("name").find(area) == -1:
                # Outside the requested area.
                continue
            locFormData.append(self.createSearchFormData(location, extra))
            locations.append(item)

        locations = sorted(locations, key=itemgetter('score'), reverse=True)
        result = {'search' : locFormData, 'area' : area, 'locations' : locations}
        cache.locations[cacheKey] = result
        return result

    @cache.methodcache.cache('performSearch', expire=72000)
    def performSearch(self, searchData):
        """Run a search and return the result item.

        The cache key is the MD5 of the JSON-serialised ``searchData``.
        What is stored in ``cache.storage`` is the result without its
        ``list`` entry, and that stored value is what a later cache hit
        returns.
        """
        hashkey = hashlib.md5(json.dumps(searchData, sort_keys=True)).hexdigest()
        cachedResult = cache.storage.get(hashkey)
        if cachedResult is not None:
            print("Found cached searchResponse")
            return cachedResult

        print("Performing search on " + json.dumps(searchData, indent=4))
        searchRequest = self.searchRequest(searchData)
        searchResponse = self.request.getUnicodeDoc(searchRequest)
        # NOTE(review): self.resultFormat is shared between searches; if
        # parseItems mutates its argument, results accumulate - confirm.
        resultData = self.parseResult(searchResponse, self.resultFormat)
        result = self.createResultItem(resultData)

        print("Storing hash " + hashkey)
        # metadata is the chart item minus the actual list plus a size
        metadata = dict((key, result[key]) for key in result.keys() if key != 'list')
        cache.storage[hashkey] = metadata
        return result

    def parseResult(self, doc, brokers=None):
        """Parse the result items of ``doc`` and of all following pages.

        ``brokers`` is the accumulator handed to ``parseItems``; a fresh
        dict is used when it is omitted.  Errors while following the
        "next page" link, including the absence of such a link, end the
        traversal and are logged at debug level.
        """
        if brokers is None:
            # A shared default dict would leak results between calls.
            brokers = {}
        brokers = self.parseItems(doc.xpath("//div[contains(@class, 'item result')]"), brokers)
        nextpage = doc.xpath('//a[@class="next_page"]')

        try:
            url = nextpage[0].attrib["href"]
            if url is not None:
                self.log.info("Parsing %s" % url)
                nextDoc = self.request.requestUnicodeDoc(self.baseUrl + url)
                # Keep the returned value in case parseItems builds a new
                # object instead of updating `brokers` in place.
                brokers = self.parseResult(nextDoc, brokers)
        except Exception as e:
            self.log.debug("ParseResult %s" % e)

        return brokers
#!/usr/bin/env python # encoding: utf-8 import sys sys.path.append("..") sys.path.append(".") from config import * from helpers.basics import load_config from helpers.logger import Logger from helpers.db_helpers import * # Script supposed to be run in the background to populate the DB with available datasets if "log" not in globals(): logger = Logger.init_logger('FLATTEN_%s'%(cfg.language_code), load_config()) logger.info("Running %s",sys.argv[0]) logger.info("Flattening and normalizing experimental results") already_existing_xp=measurements_col.distinct("xp") samples_with_results=samples_col.find({"experimental_results":{"$elemMatch":{"values":{"$exists":True}}}}) # a_sample=samples_with_results[0] n_op=0 measurements_to_insert=measurements_col.initialize_unordered_bulk_op() for a_sample in samples_with_results: # i,experimental_results=enumerate(a_sample['experimental_results']).next() for i,experimental_results in enumerate(a_sample['experimental_results']):
def __init__(self):
    """Set up the logger, the request helper, the hemnet.se URLs and the
    translation / search-type / average / result lookup tables."""
    self.log = Logger("Hemnet")
    self.request = Request()

    # Base objects for searches and results
    root = "http://www.hemnet.se"
    self.baseUrl = root
    self.baseSearch = root + "/sok/create"
    self.baseLocation = root + "/locations/show?"
    self.baseResult = root + "/resultat"
    self.searchQuery = {}

    # Basetype, english -> Swedish
    self.translatedTypes = dict([
        ("municipality", "Kommun"),
        ("district", u"Område"),
        ("postal_city", "Stadsdel"),
        ("region", u"Län"),
        ("street", "Gata"),
        ("city", "Stad"),
    ])

    # BaseAverageTypes -> Swedish
    self.translatedAverageTypes = dict([
        ("age", u"List ålder"),
        ("price", "Medelpris"),
        ("price_m2", u"Pris per m²"),
        ("size", u"Storlek (m²)"),
        ("rooms", "Antal rum"),
        ("fee", u"Månadsavgift"),
        ("price_change_up", u"Prisökning (%)"),
        ("price_change_down", u"Prissäkning (%)"),
    ])

    # searchTypes
    self.searchTypes = dict([
        ("f", "fritidshus"),
        ("v", "villa"),
        ("t", "tomt"),
        ("r", "radhus"),
        ("g", "gard"),
        ("b", "bostadsratt"),
        ("o", "other"),
        ("a", "all"),
    ])

    # Items to get average for: every counter starts at zero.
    self.itemAverageTypes = dict.fromkeys(
        ("age", "price", "price_m2", "size", "rooms", "fee",
         "price_change_up", "price_change_down"),
        0)

    # Base result format
    self.resultFormat = dict([("totalItems", 0), ("results", {})])

    self.log.info("Initiated Hemnet")