Example #1
def aggregate_preprocess_results(codes, dict_edits, dict_newcomers, dict_reverts):
    # df_topics, topics = load_topics(path_topics)
    aggs = []

    for code in codes:
        start = time.time()
        try:
            df_gb = process_edits(dict_edits, code)
            # group edits
            df_gb.rename({"title": "index", 'event_user_id': 'count', 'revision_text_bytes_diff': 'rev_len_sum'},
                         inplace=True, axis=1)
            final = df_gb.groupby(["date", "covid", "user_kind"]).sum().reset_index()

            final = process_newcomers(dict_newcomers, code, final)
            final = process_reverts(dict_reverts, code, final)

            final = final.fillna(0)
            final["code"] = code
            aggs.append(final.loc[:, final.columns != 'index'])
        except Exception as e:
            traceback.print_exc()
            Logger.instance('pipeline').info(f'Error for {code}: {str(e)}')
        Logger.instance('pipeline').info(f'Processing {code} took {time.time() - start}')
    final_aggs = pd.concat(aggs)

    return final_aggs
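
A note on an edge case, not in the original: if every code raises, aggs stays empty and pd.concat([]) raises ValueError ("No objects to concatenate"). A minimal standalone sketch of a guard, assuming pandas is available as pd:

import pandas as pd

aggs = []  # e.g. every code failed to process
# guard the concat so an all-failure run yields an empty frame instead of raising
final_aggs = pd.concat(aggs) if aggs else pd.DataFrame()
print(final_aggs.empty)  # True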
Example #2
 def worker(self, ip):
     hosts = self.config['hosts'].split(',')
     host_records = self._get_records(self.zones[self.zone])
     put_url = f"zones/{self.zones[self.zone]}/records"
     for host in hosts:
         if host == '@':
             host = self.zone
         else:
             host += '.' + self.zone
         if not host.endswith('.'):
             host += '.'
         if host not in host_records:
             Logger.warning(f"Attempted to update host '{host}' "
                            "that is not found under this account!")
             continue
         data = {
             "type": "A",
             "name": host,
             "content": ip,
             "ttl": self.ttl,
         }
         ret = self.rest.put(f"{put_url}/{host_records[host]}", data)
         if not ret:
             Logger.error(f"Unable to update host record for '{host}' at "
                          "zone '{self.zone}'")
             continue
Example #3
    def run(cls, file, *args, **kwargs):
        """
        Run this command and return a generator of filtered output lines
        Note: This is not profilable so subclasses should override it to make
        sure it's taken into account by --profile
        """
        regex = cls.make_regex(file)
        cmd = cls.make_cmd(file, *args, **kwargs)

        # Handle empty commands to save some time
        if cmd is None:
            return iter(())

        Logger.debug("Running command {}".format(" ".join(cmd)))
        process = subprocess.run(cmd,
                                 shell=False,
                                 close_fds=True,
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE)

        if process.stderr:
            text = "Error while running command \"{}\": {}".format(
                " ".join(cmd), process.stderr.decode("utf-8"))
            Logger.warn(text)

        return cls._filter(process.stdout.splitlines(True), regex)
Example #4
def _compare(config, delay_output, pair):
    """
    Compare a pair of files and return either None if they are equal
    or too similar, or a tuple of (path1, path2, distance) otherwise
    """
    file1, file2 = pair
    Logger.progress("Comparing {} and {}...".format(file1.relative_path,
                                                    file2.relative_path))

    if FileComparator.are_equal(file1, file2):
        return

    if config.compute_distance:
        distance = FilesetComparator.compute_distance(file1, file2)
    else:
        distance = None

    if distance is not None and distance < config.min_dist:
        # Ignore files that are too similar
        return

    edit = (file1, file2, distance)

    # Start printing files if we can, so user doesn't have to wait too long
    if not delay_output:
        output_change(edit, config)

    return edit
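
Because pair is the last parameter, _compare composes naturally with functools.partial and a process pool. A hypothetical driver sketch; config, delay_output, and the pairs iterable are assumed to come from the surrounding tool:

from functools import partial
from multiprocessing import Pool

compare = partial(_compare, config, delay_output)  # bind the fixed arguments
with Pool() as pool:
    # keep only pairs that differ enough to be reported
    edits = [edit for edit in pool.map(compare, pairs) if edit is not None]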
Example #5
    def __init__(self, opt):

        self.opt = opt
        self.logger = Logger(opt.model_name, opt.dataset)

        if 'bert' in opt.model_name:
            tokenizer = Tokenizer4Bert(
                opt.max_seq_len,
                opt.pretrained_bert_name)
            bert = BertModel.from_pretrained(opt.pretrained_bert_name)
            self.model = opt.model_class(bert, opt).to(opt.device)
        else:
            tokenizer = build_tokenizer(
                fnames=[opt.dataset_file['train'], opt.dataset_file['test']],
                max_seq_len=opt.max_seq_len,
                dat_fname='{0}_tokenizer.dat'.format(opt.dataset))
            embedding_matrix = build_embedding_matrix(
                word2idx=tokenizer.word2idx,
                embed_dim=opt.embed_dim,
                dat_fname='{0}_{1}_embedding_matrix.dat'.format(str(opt.embed_dim), opt.dataset))
            self.model = opt.model_class(embedding_matrix, opt).to(opt.device)

        self.trainset = ABSADataset(opt.dataset_file['train'], tokenizer)
        self.testset = ABSADataset(opt.dataset_file['test'], tokenizer)
        assert 0 <= opt.valset_ratio < 1
        if opt.valset_ratio > 0:
            valset_len = int(len(self.trainset) * opt.valset_ratio)
            self.trainset, self.valset = random_split(self.trainset, (len(self.trainset)-valset_len, valset_len))
        else:
            self.valset = self.testset

        if opt.device.type == 'cuda':
            self.logger.log('cuda memory allocated: {}'.format(torch.cuda.memory_allocated(device=opt.device.index)))
        self._print_args()
Example #6
        def serverlistener(in_q):
            while True:
                # Get some data
                data = in_q.get()
                if data == "KILLSERVERCOMMAND":
                    t1.isAlive = False
                    download_thread.isAlive = False
                    Logger("Server reports killed", Logger.INFO)
                    Logger("Exiting program! Bye. ", Logger.INFO)
                    exit(0)

                # Process the data
                args = argparse.Namespace(project=[data], disable_server=False, log_warnings=False, log_errors=False, disable_browser=True)
                # On windows: write arguments to file, spawn process, read arguments from file, delete.
                if os.name == 'nt':
                    with open('.temp_thread_file', 'a') as the_file:
                        the_file.write(data+"\n")
                        the_file.write("False\n") # disable_server
                        the_file.write("False\n")  # log_warnings
                        the_file.write("True\n")
                else:
                    with open('.temp_thread_file', 'a') as the_file:
                        the_file.write("filling")

                p = Process(target=program, args=(args,))
                p.start()
Example #7
def start_workflow(shared_state, start_date, review_number=0):
    db_connection = setup_db().connect()
    logger = Logger(db_connection)

    shared_state.job_id = None
    shared_state.completed = False

    max_downloads = environ.get('MAX_DOWNLOADS')
    if max_downloads is not None:
        max_downloads = int(max_downloads)
    max_upload_workers = int(environ.get('MAX_UPLOADERS', 20))

    try:
        workflow = Workflow(
            db_connection, logger,
            start_date,
            max_downloads, max_upload_workers,
            environ.get('ALLOW_REPEAT', 'FALSE') == 'TRUE'
        )
        workflow.start(shared_state)
    except Exception:
        logger.exception()
        if shared_state.job_id is not None:
            job_serializer = Serializer(db_connection, job)
            job_serializer.put(shared_state.job_id, {
                'status': JobStatus.FAILED,
            })
Example #8
    def importExclList(self, filename):
        try:
            with open(filename, "r") as file:
                lines_in_file = file.read().splitlines()
        except IOError:
            Logger("could not open file '%s'." % filename, Logger.ERROR)
            return list()
        line_index = 1
        try:
            for line in lines_in_file:
                parts = line.split('|||')
                searchword = parts[0]
                if len(parts) > 2:
                    comment = parts[2]
                else:
                    comment = ""
                dir_list_with_quotes = str(parts[1]).split(',')
                dir_list_without_quotes = []
                for item in dir_list_with_quotes:
                    dir_list_without_quotes.append(item.strip("\""))
                self.addExclItem(searchword, comment,
                                 os.path.join(*dir_list_without_quotes))
                line_index = line_index + 1
        except (IOError, IndexError):
            Logger("Format is not readable or file is missing: %s." % filename,
                   Logger.ERROR)
            sys.exit()
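
For reference, a sketch of the '|||'-delimited line format this parser expects, inferred from the indexing above rather than from any documentation:

# hypothetical exclusion-list line: searchword ||| quoted dir list ||| optional comment
line = 'password|||"res","values"|||ignore generated resources'
parts = line.split('|||')
print(parts[0])             # searchword
print(parts[1].split(','))  # ['"res"', '"values"'] before the quotes are stripped
print(parts[2])             # comment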
Example #9
 def importList(self, filename):
     try:
         with open(filename, "r") as file:
             lines_in_file = file.read().splitlines()
     except IOError:
         Logger("could not open file '%s'." % filename, Logger.ERROR)
         return list()
     line_index = 1
     try:
         for line in lines_in_file:
             parts = line.split('|||')
             if len(parts) > 1:
                 searchword = parts[0]
                 if parts[1]:
                     importance = int(parts[1])
                 else:
                     importance = 20
                 if len(parts) > 2:
                     comment = parts[2]
                 else:
                     comment = ""
                 owasp = "owasp_static_android.txt" in filename
                 self.addSearchItem(searchword, importance, comment, owasp)
             line_index = line_index + 1
     except (IOError, ValueError):
         Logger("Format is not readable or file is missing: %s." % filename,
                Logger.ERROR)
         sys.exit()
Example #10
    def get_q_string(self, query_string):
        """
        :param query_string: dict of query-string pairs, mapping each key to
            its value
        :returns: Str response. If error, returns None
        """
        q_string = '?'
        for k, v in query_string.items():
            q_string += f"{quote(k)}={quote(v)}&"
        try:
            res = requests.get(f"{self.base_url}{q_string}")
            '''
            Return the response text if everything goes correctly
            and the server response with 200 HTTP code.
            '''
            if res.status_code == 200:
                return res.text
            Logger.error(f"Status code is {res.status_code}. "
                         f"Response: {res.text}")
            return None

        except requests.exceptions.RequestException as e:
            Logger.error(f"Request failed: {e}")
            return None
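
A side note: the loop above leaves a trailing '&' on the query string. urllib.parse.urlencode builds the same string in one call; a minimal sketch with hypothetical pairs, passing quote_via=quote to match the quoting used above:

from urllib.parse import urlencode, quote

query = {"q": "logger", "page": "2"}
print('?' + urlencode(query, quote_via=quote))  # ?q=logger&page=2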
Example #11
def getHistoricalIntradayByMinute(ticker, day=None):
    historicalIntradayData = {}
    try:
        historicalIntradayData = get_historical_intraday(ticker, day)
    except Exception as ex:
        Logger.error('Failed querying IEX historical intraday data for {}: {}'.format(ticker, ex))
    return historicalIntradayData
Example #12
    def get(self, endpoint):
        """
        :param endpoint: REST API endpoint to GET
        :returns: Str response. If error, returns None
        """
        try:

            res = requests.get(f"{self.base_url}/{endpoint}",
                               auth=self.auth,
                               headers=self.headers)
            '''
            Return the response text if everything goes correctly
            and the server response with 200 HTTP code.
            '''
            if res.status_code == 200:
                return res.text

            Logger.error(f"Status code is {res.status_code}. "
                         f"Response: {res.text}")
            return None

        except requests.exceptions.RequestException as e:
            Logger.error(f"Request failed: {e}")
            return None
Example #13
 def do_POST(self):
     """Serve a POST request."""
     if re.findall(r'KILLSERVERCOMMAND', self.requestline):
         ServerWrapper.dragdropserver.q.put("KILLSERVERCOMMAND")
         Logger("Server upload killed", Logger.INFO)
         self.send_response(200)
         exit(0)
     r, info = self.deal_post_data()
      Logger((str(r) + str(info) + " by: " + str(self.client_address)),
             Logger.INFO)
     f = BytesIO()
     f.write(b'<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 3.2 Final//EN">')
     f.write(b"<html>\n<title>Upload Result Page</title>\n")
     f.write(b"<body>\n<h2>Upload Result Page</h2>\n")
     f.write(b"<hr>\n")
     if r:
         f.write(b"<strong>Success:</strong>")
     else:
         f.write(b"<strong>Failed:</strong>")
     f.write(info.encode())
     f.write(("<br><a href=\"%s\">back</a>" %
              self.headers['referer']).encode())
     length = f.tell()
     f.seek(0)
     self.send_response(200)
     self.send_header("Content-type", "text/html")
     self.send_header("Content-Length", str(length))
     self.end_headers()
     if f:
         self.copyfile(f, self.wfile)
         f.close()
Example #14
 def __init__(self, botId):
     self.logger = Logger("arbitrage")
     self.logger.info("#" + str(botId) + ": Initializing.")
     self.bot_manager = BotManager()
     self.exchange_manager = ExchangeManager()
     self.arbitrage_manager = ArbitrageManager()
     self.bot = self.bot_manager.get_bot(botId)
     self.arbitrage = self.arbitrage_manager.get_arbitrage(botId)
Example #15
    def getPublic():

        try:
            response = requests.get("https://ipinfo.io")
            if response.status_code == 200:
                ip = response.json()
                return ip['ip']
        except requests.exceptions.RequestException as e:
            Logger.error("Unable to get public IP address!")
            Logger.error(e)
            sys.exit(1)
Example #16
def persist_image(target_folder: str, url: str, img_name: str, logger: Logger):
    try:
        image_content = requests.get(url).content
        image_file = BytesIO(image_content)
        image = Image.open(image_file).convert('RGB')
        file_path = path.join(target_folder, f'{img_name}.png')
        with open(file_path, 'wb') as f:
            image.save(f, 'PNG', quality=100)
        logger.infoSuccess(f'SUCCESS - saved {url} - as {file_path}')
    except Exception as e:
        logger.infoDanger(f'ERROR - Could not download {url} - {e}')
Example #17
def volumeOverYearLineGraph(ticker, year):
    dailyData = historical_daily.getByTickerAndYear(ticker, year)

    seriesData = {}
    for row in dailyData:
        seriesData.update({pd.to_datetime(row.date): row.data.get('volume')})
    openingByDaySeries = pd.Series(seriesData)
    graph = openingByDaySeries.plot.line()
    graph.set_xlabel("Day")
    graph.set_ylabel("Volume")
    Logger.writeGraphToFile(graph.get_figure(),
                            "_".join([ticker, str(year), "Volume"]))
Example #18
 def find_matches_in_src_file(self, CODE_OFFSET, QUERY_IMPORTANCE):
     try:
         if len(self.file_path.encode('unicode_escape').decode()) > 255:
             Logger(
                 "Filepath is too big. Try moving the StaCoAn folder to the root of your drive, make the APK name shorter and try again. The following file will be ignored to let StaCoAn continue: '%s'"
                 % self.file_path, Logger.WARNING)
         else:
             with open(self.file_path,
                       "r",
                       encoding="utf8",
                       errors='ignore') as file:
                 lines_in_file = file.read().splitlines()
             line_index = 1
             for line in lines_in_file:
                 for listItem in SearchLists.all_lists[
                         "SRC_WORDS"].ListCollection:
                     if int(listItem.importance) > QUERY_IMPORTANCE:
                         # if re.match(File.non_regex_indicator, listItem.searchword):
                         #     Searchwords.src_search_words[query].regex = True
                         if re.search(listItem.searchword, line,
                                      re.IGNORECASE):
                             exclude = False
                             for ExclItem in SearchLists.all_lists[
                                     "EXCL_WORDS"].ListCollection:
                                 if re.search(ExclItem.searchword, line,
                                              re.IGNORECASE):
                                     if (ExclItem.dir in self.file_path
                                             or (ExclItem.dir == ""
                                                 or ExclItem.dir is None)):
                                         # Logger("SRC exclusion found: %s in file %s" % (str(ExclItem.searchword), self.file_path),
                                         #       Logger.INFO)
                                         exclude = True
                             if not exclude:
                                 upper_range = min(line_index + CODE_OFFSET,
                                                   len(lines_in_file) + 1)
                                 lower_range = max(
                                     line_index - CODE_OFFSET - 1, 1)
                                 src_match = MatchSource(
                                     listItem.searchword, line_index,
                                     lines_in_file[lower_range:upper_range],
                                     listItem.importance,
                                     len(lines_in_file), listItem.owasp,
                                     listItem.comment)
                                 self.all_matches.append(src_match)
                                 self.src_matches.append(src_match)
                 line_index = line_index + 1
             self.orden_matches()
     except IOError as e:
         Logger(
             "could not open file '%s'. Error: %s" %
             (self.file_path, e.strerror), Logger.WARNING)
         return list()
Example #19
    def worker(self, ip):

        '''
        Make the post request for each record that
        we have in records list
        '''
        for host in self.records:
            host['ipv4Address'] = ip
            response = self.rest.post('/dns/' + str(host['id']), host)
            if response:
                Logger.info(f"Updating IP address {ip} to Dynu.net record {host['name']}")
            else:
                Logger.error(f"Failed to update IP address {ip} to Dynu.net record {host['name']}")
Example #20
    def get_json(self, endpoint):
        """
        :param endpoint: REST API endpoint to GET
        :returns: dict representing json response. If error, returns None
        """
        resp = self.get(endpoint)
        if resp is None:
            # self.get() returns None on failure and has already logged it
            return None

        json_resp = None
        try:
            json_resp = json.loads(resp)
        except json.JSONDecodeError:
            Logger.error(f"Invalid response to request at endpoint {endpoint}")
        return json_resp
Example #21
    def is_increasing_consecutively(symbol, iexHistoricalData, repeats):
        Logger.debug('analyzing {} for {} repeated increasing minutes'.format(
            symbol, repeats))
        chainLength = 0
        failedChains = 0
        beLessThanMeChainHead = None
        for i in range(len(iexHistoricalData) - 1, -1, -1):
            minuteData = iexHistoricalData[i]
            currentMinuteAverage = minuteData.get('average')

            # many trading minutes have no activity
            if currentMinuteAverage is not None:
                if beLessThanMeChainHead is None:
                    beLessThanMeChainHead = currentMinuteAverage
                elif currentMinuteAverage < beLessThanMeChainHead:
                    Logger.debug('increasing consecutive count')
                    chainLength += 1
                else:
                    chainLength = 0
                    failedChains += 1
                if chainLength >= repeats:
                    Logger.debug(
                        'successful consecutive-increase check on {} for {} repeats'
                        .format(symbol, repeats))
                    return True
                if failedChains >= 3:
                    Logger.debug(
                        'aborting consecutive analysis of {}'.format(symbol))
                    return False
                beLessThanMeChainHead = currentMinuteAverage
        return False
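
A quick usage sketch with toy minute data (hypothetical entries; only the 'average' key is read, and None marks an idle minute), assuming the method is exposed as a staticmethod since it takes no self:

minutes = [{'average': 10.0}, {'average': None}, {'average': 10.5}, {'average': 11.0}]
# the scan walks backwards from the most recent minute, so this counts two increases
print(is_increasing_consecutively('AAPL', minutes, 2))  # True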
Example #22
 def worker(self, ip):
     hosts = self.config['hosts'].split(',')
     for host in hosts:
         if host == '@':
             host = self.zone
         else:
             host += '.' + self.zone
         if host not in self._zones:
             Logger.warning(f"Attempted to update host '{host}' "
                            "that is not found under this account!")
             continue
         ret = HttpProvider.get(self.update_url + host)
         if not ret:
             Logger.error("Unable to update host record "
                          f"for '{host}' at zone '{self.zone}'")
             continue
Example #23
    def find_matches_in_db_file(self):
        # Set icon of file
        self.icon = "insert_invitation"
        self.fa_icon = "database"

        db = sqlite3.connect(self.file_path)
        cursor = db.cursor()
        cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
        tables = cursor.fetchall()
        for table_name in tables:
            table_name = table_name[0]
            cursor = db.execute("SELECT * from %s" % table_name)
            line = 0
            for row in cursor.fetchall():
                line += 1
                for matchword in Searchwords.db_search_words:
                    if matchword in str(row):
                        excluded = False
                        for item in Searchwords.exclusion_list:
                            if item[0] == matchword and item[1] in self.file_path:
                                Logger("Exclusion found: %s in file %s" % (str(item[0]), self.file_path))
                                excluded = True
                        # only record the match once, and only if no exclusion rule applies
                        if not excluded:
                            importance = Searchwords.db_search_words[matchword]
                            db_match = MatchDatabase(matchword, line, str(table_name), str(row), importance)
                            self.db_matches.append(db_match)
                            self.all_matches.append(db_match)
        self.orden_matches()
Example #24
 def log_request(self, code='-', size='-'):
     if not any(
             s in str(self.requestline)
             for s in ('lootbox.html', '.ico', 'robots.txt', '.js',
                       '.css', 'start.html', '.woff2', '.png', '.jpg')):
         Logger(self.requestline + " " + str(code) + " " + str(size),
                Logger.INFO)
Example #25
 def log_error(self, format, *args):
     if not any(s in str(self.requestline)
                for s in ('lootbox.html', 'robots.txt')):
         Logger(("%s - - [%s] %s - %s\n" %
                 (self.address_string(), self.log_date_time_string(),
                  format % args, str(self.requestline))),
                Logger.WARNING)
Example #26
    def run(self):
        threadCount = 10
        budgetPerThread = self.budget / threadCount
        stockQueue = Queue()
        for stock in iex.getAllTickers():
            stockQueue.put(stock)
        threads = []
        for i in range(threadCount):
            t = threading.Thread(target=BetaTrades.quick_trade_one,
                                 args=[budgetPerThread, stockQueue])
            # t = threading.Thread(target=callme, args=[i])
            threads.append(t)
            t.start()

        Logger.trace('thread count is {}'.format(threading.activeCount()))
Example #27
 def __init__(self, estimator, target, below_loss_margin, over_loss_margin, limit_loss_N):
     self.estimator = estimator
     self.target = target
     self.below_loss_margin = below_loss_margin
     self.over_loss_margin = over_loss_margin
     self.limit_loss_N = limit_loss_N
     self.logger = Logger.get_instance(conf.LOG_BASE_NAME)
Example #28
 def create_drag_drop_server():
     Logger(
         "serving dragdrop server at port: " +
         str(ServerWrapper.DRAG_DROP_SERVER_PORT), Logger.INFO)
     return socketserver.TCPServer(
         ("", ServerWrapper.DRAG_DROP_SERVER_PORT),
         RequestHandlerClass=ServerWrapper.dragdropserver)
Example #29
 def create_reportserver():
     Logger(
         "serving report server at port: " +
         str(ServerWrapper.REPORT_SERVER_PORT), Logger.INFO)
     return socketserver.TCPServer(
         ("", ServerWrapper.REPORT_SERVER_PORT),
         RequestHandlerClass=ServerWrapper.reportserver)
Example #30
    def hook_after_request(response):
        route = request.path

        logging_method = Logger.error if response.status_code not in info_status_list else Logger.info
        Logger.debug(request)
        try:
            req_body = request.get_json()
        except Exception:
            req_body = 'No JSON in request'
        res_body = response.get_data().decode('utf-8').rstrip()
        logging_method(
            message='\nRequest: {}\nResponse: {}'.format(req_body, res_body),
            route=route,
            method=request.method,
            res_code=response.status_code
        )
        return response
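
The registration is not shown above; presumably the hook is attached through Flask's standard after-request mechanism. A minimal sketch, assuming an app object and an info_status_list like the one the hook references:

from flask import Flask

app = Flask(__name__)
info_status_list = [200, 201, 204]  # hypothetical: statuses logged at info level

# run the hook after every request/response cycle
app.after_request(hook_after_request)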
sys.path.append("..")
sys.path.append(".")
from config import *
from helpers.basics import load_config
from helpers.logger import Logger
from helpers.db_helpers import * 


## Script specific 
import sys
import logging 
import collections
import datetime

if "log" not in globals():
  log = Logger.init_logger('STATS_%s'%(cfg.language_code), load_config())


def main():
	# mongodb stats can be obtained with cfg.db.command("collstats","dict")
	db_stats={
		'A/ report_date':datetime.datetime.now(),
		'B/ Number of samples':samples_col.count(),
		'C/ Number of normalized measures':measurements_col.count(),
		'C_a/ Tally of normalized measures':measurements_col.aggregate([{"$group":{"_id":"$type", "count": { "$sum": 1 }}}])['result'],
		'D/ Number of species':species_col.count(),
Example #32
#!/usr/bin/env python
# encoding: utf-8

import sys
sys.path.append("..")
sys.path.append(".")
from config import *
from helpers.basics import load_config
from helpers.logger import Logger
from helpers.db_helpers import * 
from helpers.path import data_dir


# Script supposed to be run in the background to populate the DB with available datasets 
if "log" not in globals():
  logger = Logger.init_logger('PROCESS_MAPPINGS_%s'%(cfg.language_code), load_config())


logger.info("Running %s",sys.argv[0])



# Get available mappings and process them 
mappings_to_process=mappings_col.find({"src_to_tgt":{"$exists":False}})

logger.info("Found %d mappings to process",mappings_to_process.count())
Example #33
#!/usr/bin/env python
# encoding: utf-8

import sys
sys.path.append("..")
sys.path.append(".")
from config import *
from helpers.basics import load_config
from helpers.logger import Logger
from helpers.db_helpers import * 


# Script 
import datetime
if "log" not in globals():
  log = Logger.init_logger('SAMPLE_DATA_%s'%(cfg.language_code), load_config())

# clear db 

species_col.remove()
publications_col.remove()
samples_col.remove()
mappings_col.remove()
measurements_col.remove()



#### Melon 


# species 
Example #34
sys.path.append("..")
sys.path.append(".")
from config import *
from helpers.basics import load_config
from helpers.logger import Logger
from helpers.db_helpers import * 
from helpers.path import data_dir

# Script supposed to be run in the background to populate the DB with available datasets 
## Setup

from numbers import Number
import collections
from math import log
if "log" not in globals():
  logger = Logger.init_logger('DATA_PROCESSOR_%s'%(cfg.language_code), load_config())

logger.info("Running %s",sys.argv[0])

# Get available datasets and insert them in the DB 

# a_sample=samples_col.find_one({"experimental_results.values":{"$exists":False}})
samples_to_process=samples_col.find({"experimental_results":{"$elemMatch":{"values":{"$exists":False}}}})

logger.info("Found %d samples to process",samples_to_process.count())

for a_sample in samples_to_process:
	logger.info("Will process dataset for experiment %s",a_sample['name'])
	parser_config=a_sample['xls_parsing']
	for a_result_idx,a_result in [(i,x) for i,x in enumerate(a_sample['experimental_results']) if "values" not in x]:
		# specialize parser for the result 
Example #35
class Hemnet() :
    def __init__(self):
        self.log = Logger("Hemnet");
        self.request = Request();

        #Base objects for searches and results
        self.baseUrl = "http://www.hemnet.se";
        self.baseSearch = self.baseUrl + "/sok/create";
        self.baseLocation = self.baseUrl + "/locations/show?";
        self.baseResult = self.baseUrl + "/resultat";
        self.searchQuery = {}

        #Basetype, english -> Swedish
        self.translatedTypes = {
            "municipality" : "Kommun",
            "district" : u"Område",
            "postal_city" : "Stadsdel",
            "region" : u"Län",
            "street" : "Gata",
            "city" : "Stad"
        }
        #BaseAverageTypes  -> Swedish
        self.translatedAverageTypes = {
            "age" : u"List ålder",
            "price" : "Medelpris",
            "price_m2" : u"Pris per m²",
            "size" : u"Storlek (m²)",
            "rooms" : "Antal rum",
            "fee" : u"Månadsavgift",
            "price_change_up" : u"Prisökning (%)",
            "price_change_down" : u"Prissäkning (%)"
        }
        
        #searchTypes
        self.searchTypes = {
            "f" : "fritidshus",
            "v" : "villa",
            "t" : "tomt",
            "r" : "radhus",
            "g" : "gard",
            "b" : "bostadsratt",
            "o" : "other",
            "a" : "all"
        }
        #Items to get average for        
        self.itemAverageTypes = {
            "age" : 0, 
            "price" : 0, 
            "price_m2" : 0, 
            "size" : 0, 
            "rooms" : 0, 
            "fee" : 0,
            "price_change_up" : 0,
            "price_change_down" : 0
        };

        #Base result format
        self.resultFormat = {
            "totalItems" : 0, 
            "results" : {}
        };
        self.log.info("Initiated Hemnet");
    
    '''
        Search data is a form POST in a very specific format
    '''
    def createSearchFormData(self, data, specificType = 'a') :
        locationData = [{
            "id": (data.get("id")),
            "name": (data.get("name")),
            "parent_id": (data.get("parent_location").get("id")),
            "parent_name": (data.get("parent_location").get("name"))
        }]

        searchData = {
            "search[location_search]" : locationData,
            "search[location_ids][]": data.get("id"),
            "search[region_id]":-1,
            "search[municipality_ids][]":-1,
            "search[country_id]":0,
            "search[item_types][]": "%s" % self.searchTypes[specificType],
            "search[price_min]": '',
            "search[price_max]": '',
            "search[fee_max]": '',
            "search[rooms_min]": '',
            "search[living_area_min]": '',
            "search[keywords]":'',
            "commit": ''
        }
        return searchData;

    def searchRequest(self, query) :
        return self.request.postRequest(self.baseSearch, query);

    '''
        Pass a list of keys and a list of dicts to calculate the average value for each key
    '''
    def avgByKey(self, keys, data):
        final = {}
        for d in data:
            for k in d.keys():
                if k in keys: 
                    final[k] = final.get(k,0) + d[k]
        for k in final.keys():
            final[k] = final[k]/len(data);
        return final;

    def getLocationQueryURL(self, query):
        return "%sq=%s" % (self.baseLocation, urllib.quote(query.encode('utf-8')))

    @cache.methodcache.cache('findLocations', expire=72000)    
    def findLocations(self, query, extra, area = None) :
        queryURL = self.getLocationQueryURL(query);
        cacheResult = cache.locations.get(hashlib.md5(queryURL).hexdigest());
        if( cacheResult is not None):
            print "Found cached loc";
            return cacheResult;

        locFormData = []
        locResponse = self.request.getResponse(queryURL, None)
        jdata = json.loads(locResponse);
        print json.dumps(jdata, indent=4);
        formData = {}
        locations = []
        for id, item in enumerate(jdata) :
            item["score"] = Levenshtein.ratio(item.get("location").get("name"), query)
            if( area is not None ):
                if( item.get("location").get("parent_location").get("name").find(area) != -1 ):
                    formData = self.createSearchFormData(item.get("location"), extra);
                    locations.append(item)
                    locFormData.append(formData);
            else: 
                formData = self.createSearchFormData(item.get("location"), extra);
                locations.append(item)    
                locFormData.append(formData);


        locations = sorted(locations, key=itemgetter('score'), reverse=True)
        result = {'search' : locFormData, 'area' : area, 'locations' : locations };
        cache.locations[hashlib.md5(queryURL).hexdigest()] = result
        return result;

    @cache.methodcache.cache('performSearch', expire=72000) 
    def performSearch(self, searchData):
        hashkey = hashlib.md5(
            json.dumps(searchData, sort_keys=True)
        ).hexdigest();
        cachedResult = cache.storage.get(hashkey);
        if(cachedResult is not None):
            print "Found cached searchResponse";
            return cachedResult;

        print "Performing search on " + json.dumps(searchData, indent=4); 
        searchRequest = self.searchRequest(searchData);
        searchResponse = self.request.getUnicodeDoc(searchRequest);
        resultData = self.parseResult(searchResponse, self.resultFormat);
        result = self.createResultItem(resultData);
        
        print "Storing hash " + hashkey;


        chart_list = cache.storage.get(hashkey, {})

        # metadata is the chart item minus the actual list plus a size
        metadata_keys = filter(lambda k: k != 'list', result.keys())
        metadata = { key: result[key] for key in metadata_keys }
        chart_list[hashkey] = metadata
        cache.storage[hashkey] = chart_list[hashkey]
        return result;


    def parseResult(self, doc, brokers = None) :
        # a mutable default dict would persist between calls, so create it here
        if brokers is None:
            brokers = {}
        brokers = self.parseItems(doc.xpath("//div[contains(@class, 'item result')]"), brokers);
        nextpage = doc.xpath('//a[@class="next_page"]');
        
        try:
            url = nextpage[0].attrib["href"];
            if url is not None:
                self.log.info("Parsing %s" % url);
                nextDoc = self.request.requestUnicodeDoc(self.baseUrl + url);
                self.parseResult(nextDoc, brokers);
        except Exception,e:
            self.log.debug("ParseResult %s" % e)
            pass;
        
        return brokers;
Example #36
#!/usr/bin/env python
# encoding: utf-8

import sys
sys.path.append("..")
sys.path.append(".")
from config import *
from helpers.basics import load_config
from helpers.logger import Logger
from helpers.db_helpers import * 


# Script supposed to be run in the background to populate the DB with available datasets 
if "log" not in globals():
  logger = Logger.init_logger('FLATTEN_%s'%(cfg.language_code), load_config())



logger.info("Running %s",sys.argv[0])



logger.info("Flattening and normalizing experimental results")
already_existing_xp=measurements_col.distinct("xp")
samples_with_results=samples_col.find({"experimental_results":{"$elemMatch":{"values":{"$exists":True}}}})
# a_sample=samples_with_results[0]
n_op=0
measurements_to_insert=measurements_col.initialize_unordered_bulk_op()
for a_sample in samples_with_results:
	# i,experimental_results=enumerate(a_sample['experimental_results']).next()
	for i,experimental_results in enumerate(a_sample['experimental_results']):