Code Example #1
File: parser.py Project: wyc192273/hawkeye
 def request(self, keyword_url):
     if not self.proxies:
         logging.error('No proxy is available')
         return None
     referer = self.referer
     proxy = random.choice(self.proxies)
     user_agent = random.choice(self.user_agents)
     proxies = {self.protocol: '{}://{}'.format(self.protocol, proxy)}
     headers = {'User-Agent': user_agent, 'referer': referer}
     keyword = keyword_url[0]
     url = keyword_url[1]
     timeout = 10
     try:
         req = requests.get(url,
                            proxies=proxies,
                            headers=headers,
                            timeout=timeout)
     except Exception as e:
         logging.debug('Request with proxy {} failed: {}'.format(proxy, e))
         # remove the useless proxy and retry with another one
         try:
             self.proxies.remove(proxy)
             logging.info('Removed useless proxy {}'.format(proxy))
         except ValueError:
             pass
         return self.request(keyword_url)
     return req
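
A minimal, self-contained sketch of the same proxy-rotation-with-retry idea, assuming only the requests library and a plain list of proxy addresses (the class attributes above become function arguments here; names are illustrative):

import logging
import random

import requests


def fetch_with_proxies(url, proxies_pool, protocol="http", timeout=10):
    """Try the URL through random proxies, dropping any proxy that fails."""
    while proxies_pool:
        proxy = random.choice(proxies_pool)
        proxies = {protocol: "{}://{}".format(protocol, proxy)}
        try:
            return requests.get(url, proxies=proxies, timeout=timeout)
        except requests.RequestException:
            # drop the dead proxy so it is not picked again
            proxies_pool.remove(proxy)
            logging.info("Removed useless proxy %s", proxy)
    logging.error("No proxy is available")
    return None
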
Code Example #2
def run():
    try:
        content = load_content()
        data = parse_content(content)
        publish(data)
    except Exception as e:
        logging.error(str(e))
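
A side note on this pattern: logging.error(str(e)) discards the traceback, while logging.exception keeps it. A small sketch of the alternative, under no extra assumptions:

import logging


def run_safely(step):
    try:
        step()
    except Exception:
        # logs at ERROR level and appends the full traceback automatically
        logging.exception("Step %s failed", step.__name__)
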
Code Example #3
def write_to_mongo(df, database, collection, incremental_run):
    """
        This method writes the data into MongoDB.
        If incremental value is 1, then the data is appended to the database. Else, overwritten.

        Parameters:
        -----------
        df (Dataframe): The dataframe to be written to MongoDB.
        database (string): The database in which we are going to write the data.
        collection (string): The collection in which we are going to write the data.
        incremental_run (int): Determines if data is overwritten or appended to the collection.
    """

    try:
        logging.info('Write to MongoDB in progress')
        write_mode = "overwrite"
        if incremental_run:
            write_mode = "append"
        df.write.format("mongo").mode(write_mode).option(
            "database", database).option("collection", collection).save()
        logging.info('Write to MongoDB completed successfully')

    except Exception as e:
        logging.error('Error in write_to_mongo() function: {0}'.format(e))
        raise e
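
A hypothetical call site for the function above, assuming a SparkSession with the MongoDB Spark connector on its classpath and a dataframe df already loaded; the database and collection names are placeholders:

# Append today's rows on an incremental run (1), or overwrite the collection
# on a full run (0).
write_to_mongo(df, database="analytics", collection="daily_user_counts",
               incremental_run=1)
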
Code Example #4
def get_states_from_graphml(filename: str):
    """
    creates state list
    :return:
    """
    try:
        data = xmltodict.parse(open(filename).read())
    except FileNotFoundError:
        logging.error('File %s does not exist' % filename)
        return list(), 0, 0
    # get nodes from file
    flat_nodes = gr.get_flat_nodes(data)
    state_nodes = [node for node in flat_nodes if
                   gr.is_node_a_state(node) or gr.is_node_a_choice(node) or gr.is_node_a_group(node)]
    state_nodes.sort(key=lambda st: len(st['id']))
    gr.update_qroup_nodes(state_nodes)
    state_nodes.sort(key=gr.coord_sort)
    coords = gr.get_minmax_coord(state_nodes)  # get min and max coordinates and the height and width of the scheme
    # create states from nodes and add internal triggers to list of signals and all functions to function list
    qm_states, player_signal = qm.create_states_from_nodes(state_nodes, coords, [], [])
    # get edges for external triggers
    flat_edges = gr.get_flat_edges(data)
    try:
        start, start_node, start_action = gr.get_start_node_data(flat_nodes, flat_edges)
    except ValueError:
        logging.error('UML-diagram %s.graphml does not have start node' % filename)
        return list(), 0, 0
    # add external trigger and update list of signals with them
    _ = qm.update_states_with_edges(qm_states, flat_edges, start, player_signal, coords[0], coords[1])
    return qm_states, coords[0], coords[1]
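
The gr.* and qm.* helpers are project-specific, but the xmltodict call itself is standard; a tiny self-contained sketch of the structure it returns:

import xmltodict

doc = xmltodict.parse("""
<graphml>
  <graph id="G">
    <node id="n0"/>
    <node id="n1"/>
  </graph>
</graphml>
""")
# nested dicts/lists keyed by tag name; attributes are prefixed with '@'
print(doc["graphml"]["graph"]["node"][0]["@id"])  # -> n0
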
Code Example #5
File: httpclient.py Project: navyaijm2017/website
def httpResponse(url, method='POST', data=None, headers=None):
    response = None
    if headers is None:
        headers = {
            "Origin": "https://www.baidu.com",
            "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) " +\
                "AppleWebKit/537.36 (KHTML, like Gecko) " +\
                "Chrome/34.0.1847.116 Safari/537.36",
            "Content-Type": "application/x-www-form-urlencoded",
            "Referer": "https://www.baidu.com/login?forward=http://localhost",
            "Accept-Encoding": "gzip,deflate,sdch",
            "Accept-Language": "zh-CN,zh;q=0.8,en;q=0.6,zh-TW;q=0.4",
            "Cookie": "sessionid=ubwzabfvvyy0ft4y4nk5qlduv7nswrim",
        }
    try:
        request = urllib2.Request(url,
                                  data,
                                  headers=headers,
                                  origin_req_host=None,
                                  unverifiable=False)
        #request = urllib2.Request(url, data, origin_req_host=None, unverifiable=False)
        request.get_method = lambda: method
        try:
            response = urllib2.urlopen(request)  #python <= 2.6
        except AttributeError as e:
            logging.debug(u'URLopen AttributeError: {msg}'.format(msg=e))
            response = urllib2.urlopen(
                request, context=ssl._create_unverified_context())  #python 2.7
    except urllib2.HTTPError as e:
        logging.error(u'HTTP server ({url}) could not complete the request. '
                      u'Error code: {code}'.format(url=url, code=e.code),
                      exc_info=True)
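
The snippet above is Python 2 (urllib2). A rough Python 3 sketch of the same idea, assuming placeholder headers and keeping the unverified-SSL behaviour only for parity with the original:

import logging
import ssl
import urllib.error
import urllib.request


def http_response_py3(url, method="POST", data=None, headers=None):
    headers = headers or {"User-Agent": "Mozilla/5.0"}
    request = urllib.request.Request(url, data=data, headers=headers, method=method)
    try:
        # an unverified context mirrors the original; prefer a verified one in production
        return urllib.request.urlopen(request, context=ssl._create_unverified_context())
    except urllib.error.HTTPError as e:
        logging.error("HTTP server (%s) could not complete the request. Error code: %s",
                      url, e.code, exc_info=True)
        return None
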
Code Example #6
def read_from_cassandra(incremental_run, keyspace, table):
    """
        This method reads the data from cassandra based on the incremental_run
        value.If incremental value is 1, then the data read is for the running
        week and fetches the current day data from it. Else, fetches all the
        data from cassandra.

        Parameters:
        -----------
        incremental_run (int): Determines how data is to be read.
        keyspace (string): Cassandra keyspace from which data is to be read.
        table (string): Cassandra table inside the keyspace from which data is
                        to be read.

        Returns
        --------
        df (Dataframe): The dataframe obtained after reading from Cassandra
    """

    try:
        logging.info('Read from Cassandra in progress')
        column_names = ["event_time", "user_id"]
        if incremental_run:
            today = date.today()
            year, week_num, _ = today.isocalendar()
            next_day = today + timedelta(days=1)
            today_starting_timestamp = datetime(today.year, today.month,
                                                today.day)
            next_day_starting_timestamp = datetime(next_day.year, next_day.month,
                                                   next_day.day)

            #Set condition to fetch the current day's data by pushing down the
            #predicate to reduce the number of entries retrieved from the database.
            incremental_condition = \
                (F.col("year") == year) & (F.col("week") == week_num) & \
                (F.col("event_time") >= today_starting_timestamp) & \
                (F.col("event_time") < next_day_starting_timestamp)

            df=spark.read.format("org.apache.spark.sql.cassandra")\
                      .option("spark.cassandra.connection.port", "9042")\
                      .option("keyspace", keyspace)\
                      .option("table", table)\
                      .load()\
                      .select(column_names)\
                      .where(incremental_condition)
        else:
            df=spark.read.format("org.apache.spark.sql.cassandra")\
                      .option("spark.cassandra.connection.port", "9042")\
                      .option("keyspace", keyspace)\
                      .option("table", table)\
                      .load()\
                      .select(column_names)

        logging.info('Dataframe loaded successfully')
        return df

    except Exception as e:
        logging.error('Error in read_from_cassandra() function: {0}'.format(e))
        raise e
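
The year/week_num values used in the predicate come from the ISO calendar; a quick standalone illustration of what isocalendar() returns:

from datetime import date

year, week_num, day_of_week = date.today().isocalendar()
# e.g. date(2021, 1, 4).isocalendar() -> (2021, 1, 1): ISO year, ISO week, weekday (Mon=1)
print(year, week_num, day_of_week)
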
Code Example #7
def log_error_and_upload_manifests_to_s3(error, elasticsearch_docs):
    logging.error("Exception caught while sending manifests to elasticsearch")
    logging.exception(error)
    logging.info("Uploading manifests to s3 fallback bucket")
    s3_client.put_object(Bucket=MANIFEST_FALLBACK_BUCKET,
                         Key=os.path.join(f"s3-batch/manifests/{datetime.utcnow().strftime('%Y-%m-%d')}.json.gz"),
                         Body=gzip.compress(json.dumps(elasticsearch_docs).encode("utf-8")),
                         ACL="private")
Code Example #8
def main(filenames: Union[List[str], str]):

    qm_model, qm_package = cr.prepare_qm()
    player_signal = list()
    event_fields = dict()
    ctor_fields = dict()
    ctor_code = ""
    cppcode = ""
    hcode = ""
    if not isinstance(filenames, list):
        filenames = [filenames]
    modelnames: List[str] = list()
    for filename in filenames:
        try:
            data = xmltodict.parse(open(filename).read())
            modelname = os.path.basename(filename)
            modelname = modelname.split('.')[0]
            modelname = modelname[0].lower() + modelname[1:]
            modelnames.append(modelname)
        except FileNotFoundError:
            logging.error('File %s does not exist' % filename)
            continue
        # get nodes from file
        flat_nodes = gr.get_flat_nodes(data)
        state_nodes = [node for node in flat_nodes if
                       gr.is_node_a_state(node) or gr.is_node_a_choice(node) or gr.is_node_a_group(node)]
        state_nodes.sort(key=lambda st: len(st['id']))
        gr.update_qroup_nodes(state_nodes)
        state_nodes.sort(key=gr.coord_sort)

        coords = gr.get_minmax_coord(state_nodes)      # get min and max coordinates and the height and width of the scheme
        # create states from nodes and add internal triggers to list of signals and all functions to function list
        functions: List[str] = list()
        qm_states, player_signal = qm.create_states_from_nodes(state_nodes, coords, player_signal, functions)
        # get edges for external triggers
        flat_edges = gr.get_flat_edges(data)
        try:
            start, start_node, start_action = gr.get_start_node_data(flat_nodes, flat_edges)
        except ValueError:
            logging.error('UML-diagram %s.graphml does not have start node' % filename)
            continue
        # add external trigger and update list of signals with them
        player_signal = qm.update_states_with_edges(qm_states, flat_edges, start, player_signal, coords[0], coords[1])
        # get notes
        notes = [node for node in flat_nodes if gr.is_node_a_note(node)]
        # create qm data
        event_fields, hcode, cppcode, ctor_code, ctor_fields = cr.create_qm(qm_package, modelname, start_node,
                                                                            start_action, notes, qm_states,
                                                                            coords)
    # create file with final code
    try:
        cr.finish_qm(qm_model, qm_package, os.path.splitext(filenames[0])[0], modelnames, player_signal, event_fields, hcode, cppcode,
                     ctor_code, ctor_fields)
    except PermissionError:
        logging.fatal("File already exists and is locked")
    service_files.create_files(os.path.dirname(filenames[0]), player_signal, modelname, functions)
Code Example #9
File: es_util.py Project: dovanduy/choinho
    def force_bulk(self, bulk=False):
     if bulk or len(self.bulker) >= self.batchSize:  # flush when forced or when the buffer is full
            success, errors = helpers.bulk(
                self.esConn, self.bulker.pop_all(),
                chunk_size=self.batchSize)  # @UnusedVariable
            if errors:
                logging.error("Force bulk: there are some errors %s", errors)
                return False

        return True
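
Conditions like this are a common place for an accidental chained comparison: in Python, a >= b == 0 means a >= b and b == 0, not (a >= b) == 0. A quick self-contained illustration:

size, batch = 1500, 1000

print(size >= batch == 0)                 # False: both links must hold, and batch == 0 is False
print((size >= batch) and (batch == 0))   # the equivalent expansion
print(size >= batch)                      # True: the usual "buffer is full" test
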
Code Example #10
def read_from_cassandra(incremental_run, keyspace, table):
    """
        This method reads the data from cassandra based on the incremental_run value.
        If incremental value is 1, then the data read is for the running week and fetches
        the current day data from it. Else, fetches all the data from cassandra.

        Parameters:
        -----------
        incremental_run (int): Determines how data is to be read. 
        keyspace (string): Cassandra keyspace from which data is to be read.
        table (string): Cassandra table inside the keyspace from which data is to be read.

        Returns
        --------
        df (Dataframe): The dataframe obtained after reading from Cassandra 
    """

    try:
        logging.info('Read from Cassandra in progress')
        column_names = ["event_time", "user_id"]
        if incremental_run:
            #Get current week number
            today_date = datetime.date.today()
            year, week_num, day_of_week = today_date.isocalendar()

            #Set condition to fetch the current week's data by pushing down the
            #predicate to reduce the number of entries retrieved from the database.
            incremental_condition = (F.col("year") == year) & (F.col("week")
                                                               == week_num)

            #Get the current week data from cassandra
            df=spark.read.format("org.apache.spark.sql.cassandra")\
                      .option("keyspace", keyspace)\
                      .option("table", table)\
                      .load()\
                      .select(column_names)\
                      .where(incremental_condition)

            #Filter and fetch the current day's data
            df = df.filter(F.dayofmonth(df.event_time) == today_date.day)
        else:
            #Read the entire table from Cassandra
            df=spark.read.format("org.apache.spark.sql.cassandra")\
                      .option("spark.cassandra.connection.port", "9042").option("keyspace", keyspace)\
                      .option("table", table)\
                      .load()\
                      .select(column_names)

        logging.info('Dataframe loaded successfully')
        return df

    except Exception as e:
        logging.error('Error in read_from_cassandra() function: {0}'.format(e))
        raise e
Code Example #11
File: es_util.py Project: dovanduy/choinho
 def upsert_batch(self,
                  indexName,
                  indexType,
                  docs,
                  batchSize=1000,
                  idField=None):
     actions = self._buildIndexActions(indexName, indexType, docs, idField)
     success, errors = helpers.bulk(self.esConn,
                                    actions,
                                    chunk_size=batchSize)  # @UnusedVariable
     if errors:
         logging.error("Upsert batch: there are some errors %s", errors)
Code Example #12
def get_opened_files(log_errors=True):
    for pid in psutil.pids():
        try:
            yield (file[0] for file in psutil.Process(pid).open_files())
        except psutil.AccessDenied as e:
            if log_errors:
                logging.error("Access denied while getting process opened files")
                logging.exception(e)
        except psutil.NoSuchProcess:
            logging.debug("Process no longer exists")
        except Exception as e:
            if log_errors:
                logging.exception(e)
Code Example #13
 def get(self, path: str) -> str:
     """
     Downloads a file and return its contents as a string. Objects that are larger than available
     memory cannot be loaded via `get`.
     """
     parsed_path = parse_path(path)
     try:
         data = self.client.get_object(Bucket=parsed_path["bucket"],
                                       Key=parsed_path["key"])["Body"]
         return data.read().decode("unicode-escape")
     except ClientError as err:
         self.error = err
         if err.response["Error"]["Code"] == "NoSuchKey":
             log.error("S3 file %s does not exist", path)
Code Example #14
def process_request(command):
    pidlist = []
    for proc in process_iter():
        if re.match(command, proc.name()):
            pidlist.append(proc.pid)

    for pid in pidlist:
        process = Process(pid)
        try:
            ios = process.io_counters()
            for iotype in ios._fields:
                IO_PROCESS.labels(io_type=iotype, pid=pid,
                                  cmd=process.name()).set(getattr(ios, iotype))
        except AccessDenied:
            logging.error("unable to access to PID %s stats" % pid)
    return IO_PROCESS
Code Example #15
File: httpclient.py Project: navyaijm2017/website
def multiHttpResponse(url, method='POST', data=None, count=5, sleep=5):
    '''
    Send data via HTTP GET/POST, retrying on failure (default: 5 retries with a 5-second delay between attempts).
    '''
    num = 0
    while count:
        num += 1
        logging.info(u'{method} request, attempt #{num}.'.format(method=method, num=num))
        code, result = httpResponse(url, method, data)
        if code == 200 or result != 'error':
            return code, result
        else:
            time.sleep(sleep)
            count -= 1
    logging.error(u'Still failing after {num} attempts, giving up.'.format(num=num))
    return False, 'error'
Code Example #16
def main(filenames):


    #filenames = ['ka-tet', 'prioritizer2', 'location', 'emotion', 'dogan_ligt', 'reason_handler', 'dogan1',
     #            'lightsaber']
    #filenames = ["ka_tet_counter", "ka_tet", "character"]

    qm_model, qm_package = cr.prepare_qm()
    player_signal = []

    for filename in filenames:

        try:
            data = xmltodict.parse(open(filename + '.graphml').read())
        except FileNotFoundError:
            logging.error('File %s.graphml does not exist' % filename)
            continue

        #get nodes from file
        flat_nodes = gr.get_flat_nodes(data)
        state_nodes = [node for node in flat_nodes if
                       gr.is_node_a_state(node) or gr.is_node_a_choice(node) or gr.is_node_a_group(node)]
        gr.update_qroup_nodes(state_nodes)
        state_nodes.sort(key=gr.coord_sort)

        coords = gr.get_minmax_coord(state_nodes)      # get min and max coordinates and the height and width of the scheme
        #create states from nodes and add internal triggers to list of signals
        qm_states, player_signal = qm.create_states_from_nodes(state_nodes, coords, player_signal)
        #get edges for external triggers
        flat_edges = gr.get_flat_edges(data)
        try:
            start, start_node, start_action = gr.get_start_node_data(flat_nodes, flat_edges)
        except ValueError:
            logging.error('UML-diagram %s.graphml does not have start node' % filename)
            continue

        #add external trigger and update list of signals with them
        player_signal = qm.update_states_with_edges(qm_states, flat_edges, start, player_signal, coords[0], coords[1])
        #get notes
        notes = [node for node in flat_nodes if gr.is_node_a_note(node)]
        #create qm data
        event_fields = cr.create_qm(qm_package, filename, start_node, start_action, notes, qm_states, coords, player_signal)
    #create file with final code
    try:
        cr.finish_qm(qm_model, qm_package, filenames, player_signal, event_fields)
    except PermissionError:
        logging.fatal("File already exists and is locked")
Code Example #17
def get_user_count_by_day(cached_df):
    """
        This method finds out the daily user count using the platform.

        Parameters:
        -----------
        cached_df (Dataframe): The cached dataframe obtained after reading from Cassandra

        Returns
        --------
        result (Dataframe): The dataframe with the daily user count.
    """

    try:
        logging.info('Getting user count by day in progress')
        cached_df.createOrReplaceTempView('cached_df')
        user_count_per_day_df = \
                                spark.sql('''
                                with grouped_user_by_day AS (
                                SELECT
                                    user_id, DATE(event_time) as date1
                                FROM
                                    cached_df
                                GROUP BY
                                    user_id,DATE(event_time)
                                )
                                SELECT
                                    date1, COUNT(1)
                                FROM
                                    grouped_user_by_day
                                GROUP BY
                                    date1
                                '''
                                )
        ##    result = result.withColumn("day",result["day"].cast(StringType()))
        ##    result = result.groupBy("year","month").agg(
        ##        F.map_from_entries(\
        ##        F.collect_list(\
        ##        F.struct("day", "count"))).alias("user_count"))
        ##    return result
        logging.info('Got user count by day successfully')
        return user_count_per_day_df

    except Exception as e:
        logging.error(
            'Error in get_user_count_by_day() function: {0}'.format(e))
        raise e
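
Spark SQL resolves table names against registered temporary views, so the name passed to createOrReplaceTempView must match the one referenced in the SQL. A minimal local sketch, assuming pyspark is installed; the data is made up:

from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[*]").appName("view-demo").getOrCreate()
df = spark.createDataFrame([("u1", "2024-01-01"), ("u2", "2024-01-01")],
                           ["user_id", "event_time"])
df.createOrReplaceTempView("cached_df")

# the FROM clause must use the registered view name, not the Python variable name
spark.sql("SELECT COUNT(DISTINCT user_id) AS users FROM cached_df").show()
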
Code Example #18
def restore_from_dir(openshift_client, directory, resources):
    for resource_kind in os.listdir(directory):
        print(resources)
        if resource_kind in resources:
            resource_kind_dir = directory + '/' + resource_kind
            for single_resource in os.listdir(resource_kind_dir):
                full_path = '{0}/{1}'.format(resource_kind, single_resource)
                log.info('Restoring {}'.format(full_path))
                try:
                    with open(resource_kind_dir + '/' + single_resource,
                              'r') as f:
                        resource_yaml = yaml.safe_load(f)
                    openshift_client.create_resource(resource_kind,
                                                     resource_yaml,
                                                     args.restore_project_name)
                except ApiException as err:
                    log.error('Unable to restore {0}'.format(full_path))
                    log.debug(err)
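
yaml.safe_load parses plain YAML without instantiating arbitrary Python objects; a small self-contained sketch of just the loading step (the resource below is a made-up example):

import yaml

resource_yaml = yaml.safe_load("""
apiVersion: v1
kind: ConfigMap
metadata:
  name: example
""")
print(resource_yaml["kind"])  # -> ConfigMap
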
Code Example #19
def main(filenames: Union[List[str], str]):

    player_signal = list()
    if not isinstance(filenames, list):
        filenames = [filenames]
    modelnames: List[str] = list()
    for filename in filenames:
        try:
            data = xmltodict.parse(open(filename).read())
            modelname = os.path.basename(filename)
            modelname = modelname.split('.')[0]
            modelname = modelname[0].lower() + modelname[1:]
            modelnames.append(modelname)
        except FileNotFoundError:
            logging.error('File %s does not exist' % filename)
            continue
        # get nodes from file
        flat_nodes = gr.get_flat_nodes(data)
        state_nodes = [node for node in flat_nodes if
                       gr.is_node_a_state(node) or gr.is_node_a_choice(node) or gr.is_node_a_group(node)]
        state_nodes.sort(key=lambda st: len(st['id']))
        gr.update_qroup_nodes(state_nodes)
        state_nodes.sort(key=gr.coord_sort)

        coords = gr.get_minmax_coord(state_nodes)      # get min and max coordinates and the height and width of the scheme
        # create states from nodes and add internal triggers to list of signals and all functions to function list
        functions: List[str] = list()
        qm_states, player_signal = qm.create_states_from_nodes(state_nodes, coords, player_signal, functions)
        # get edges for external triggers
        flat_edges = gr.get_flat_edges(data)
        try:
            start, start_node, start_action = gr.get_start_node_data(flat_nodes, flat_edges)
        except ValueError:
            logging.error('UML-diagram %s.graphml does not have start node' % filename)
            continue
        # add external trigger and update list of signals with them
        player_signal = qm.update_states_with_edges(qm_states, flat_edges, start, player_signal, coords[0], coords[1])
        # get notes
        notes = [node for node in flat_nodes if gr.is_node_a_note(node)]
        # TODO(aeremin) Extract to separate file.
        CppFileWriter(modelname, start_node, start_action, qm_states, notes, player_signal).write_to_file(os.path.dirname(filename))

    service_files.create_files(os.path.dirname(filenames[0]), player_signal, modelname, functions)
Code Example #20
def get_user_count_by_hour(df):
    """
        This method finds out the hourly user count using the platform.

        Parameters:
        -----------
        df (Dataframe): The dataframe obtained after reading from Cassandra 

        Returns
        --------
        result (Dataframe): The dataframe with the hourly user count.
    """

    try:
        logging.info('Getting user count by hour in progress')
        df.createOrReplaceTempView('df')
        result = \
            spark.sql('''

            with cte1 AS (
            SELECT DATE(event_time) as date1, HOUR(event_time) as hour,
                1 as count
            FROM df
                GROUP BY
            user_id,DATE(event_time), HOUR(event_time)
            )
            SELECT to_timestamp(CONCAT(cast(date1 as string),"/",cast(hour as string),":00:00"), "yyyy-MM-dd/HH:mm:ss") as date,
            YEAR(date1) as year, MONTH(date1) as month, DAY(date1) as day, hour, SUM(count) as count FROM cte1 GROUP BY date1,hour
        ''')
        ##    result = result.withColumn("day",result["day"].cast(StringType()))
        ##    result = result.groupBy("year","month").agg(
        ##        F.map_from_entries(\
        ##        F.collect_list(\
        ##        F.struct("day", "count"))).alias("user_count"))
        ##    return result
        logging.info('Got User count by hour successfully')
        return result

    except Exception as e:
        logging.error(
            'Error in get_user_count_by_hour() function: {0}'.format(e))
        raise e
Code Example #21
 def list(self, path: str) -> List[str]:
     """
     Lists all objects at the given S3 path and returns them in a list. Each element of the
     returned list is a fully-qualified S3 path (i.e. it could be passed to other s3_manager
     functions).
     """
     objects = []
     parsed_path = parse_path(path)
     try:
         keys = self.client.list_objects_v2(
             Bucket=parsed_path["bucket"],
             Prefix=parsed_path["key"])["Contents"]
         objects.extend([
             "s3://" + parsed_path["bucket"] + "/" + k["Key"] for k in keys
         ])
     except ClientError as err:
         self.error = err
         if err.response["Error"]["Code"] == "NoSuchKey":
             log.error("S3 file %s does not exist", path)
     return objects
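
Note that list_objects_v2 returns at most 1000 keys per call, and omits the "Contents" key entirely when nothing matches the prefix. A hedged sketch of the same listing using a boto3 paginator, which handles both cases:

import boto3


def list_s3_objects(bucket, prefix):
    client = boto3.client("s3")
    paginator = client.get_paginator("list_objects_v2")
    objects = []
    for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
        # pages with no matches simply omit 'Contents'
        objects.extend("s3://" + bucket + "/" + obj["Key"]
                       for obj in page.get("Contents", []))
    return objects
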
Code Example #22
def get_weather():
    url = "https://devapi.heweather.net/v7/weather/24h"
    key = global_config.get('config', 'key')
    location = global_config.get('config', 'location')

    # retry up to 5 times on failure
    count = 0
    while count < 5:
        try:
            count += 1
            result = requests.get(url=url,
                                  params={
                                      'location': location,
                                      'key': key
                                  },
                                  timeout=(3, 1))
            if result:
                return get_result(result)
        except Exception as e:
            logging.error(f'retry,{count}, {e}')
            time.sleep(1)
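
requests accepts either a single timeout value or a (connect, read) tuple; the (3, 1) above allows 3 seconds to establish the connection and 1 second for the response. A small self-contained sketch of the same retry loop with a placeholder endpoint and a slightly more forgiving read timeout:

import logging
import time

import requests


def get_with_retries(url, params=None, retries=5, connect_timeout=3, read_timeout=5):
    for attempt in range(1, retries + 1):
        try:
            response = requests.get(url, params=params,
                                    timeout=(connect_timeout, read_timeout))
            response.raise_for_status()
            return response
        except requests.RequestException as e:
            logging.error("retry %d/%d: %s", attempt, retries, e)
            time.sleep(1)
    return None
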
Code Example #23
File: graphml.py Project: notiel/graphml-to-xml
def main(filenames):
    for filename in filenames:
        try:
            data = xmltodict.parse(open(filename + '.graphml').read())
        except FileNotFoundError:
            logging.error('File %s.graphml does not exist' % filename)
            continue
        flat_nodes = gr.get_flat_nodes(data)
        state_nodes = [
            node for node in flat_nodes if gr.is_node_a_state(node)
            or gr.is_node_a_choice(node) or gr.is_node_a_group(node)
        ]
        gr.update_qroup_nodes(state_nodes)
        states = makexml.create_states_from_nodes(state_nodes)
        flat_edges = gr.get_flat_edges(data)
        try:
            start, start_action = gr.get_start_node_action(
                flat_nodes, flat_edges)
        except ValueError:
            logging.error('UML-diagram %s.graphml does not have start node' %
                          filename)
            continue
        makexml.update_states_with_edges(states, flat_edges, start)
        makexml.createxml(filename, states, start_action)
Code Example #24
def replace_image(job_id,
                  file_name,
                  html_string,
                  bucket_name,
                  bucket_folder='content/'):
    # parse html and put it in a variable
    images = set(re.findall("src='([^']+)'", html_string))

    logging.info("[IMG] Start analyzing html for job %s in file %s", job_id,
                 file_name)

    # run loop for all images in the html
    # Upload images in our bucket and replace image src
    for image in images:
        image_src = image.strip()

        # if image was not uploaded to hackapad s3 ignore
        if not image_src.startswith(
                'https://hackpad-attachments.s3.amazonaws.com/'):
            continue

        logging.info("[IMG] Processing image %s" % image_src)

        #get image mime_type
        mime_type_info = mimetypes.guess_type(image_src)
        mime_type = mime_type_info[0] if mime_type_info[0] else 'image/jpeg'

        # construct expire and cache_control headers
        days = 100
        cache_control = 'max-age= %d' % (60 * 60 * 24 * days)
        expires = datetime.utcnow() + timedelta(days=days)
        expires = expires.strftime("%a, %d %b %Y %H:%M:%S GMT")

        try:
            logging.info("[IMG] First try for image %s", image_src)
            # split the URL into parts; the last element is the image file name
            image_url_parts = image_src.split('/')
            image_name = image_url_parts

            # read image url
            image_src_parsed = urllib.parse.urlparse(image_src)
            image_name_encoded = urllib.parse.quote(image_src_parsed.path)
            file = io.BytesIO(
                urllib.request.urlopen(
                    urllib.parse.urljoin(image_src,
                                         image_name_encoded)).read())
            img = Image.open(file, mode='r')
        except urllib.error.HTTPError as error:
            logging.warning(
                "[IMG] First try block resulted in urllib.error.HTTPError: %s"
                % error)
            try:
                logging.info("[IMG] retry for image %s", image_src)
                file = io.BytesIO(urllib.request.urlopen(image_src).read())
                img = Image.open(file, mode='r')
            except urllib.error.HTTPError as error:
                logging.error("[IMG] %s", error.read())
                continue
            except UnicodeEncodeError:
                logging.error("[IMG] UnicodeEncodeError for image %s",
                              image_src)
                continue

        # get the image extension (Pillow expects 'JPEG' rather than 'JPG')
        image_parts = image_src_parsed.path.split('.')
        image_extension = 'JPEG' if image_parts[-1].upper() == 'JPG' else image_parts[-1]
        # hack for weird image URLs
        if len(image_extension) > 4:
            image_extension = 'png'

        # stream file in binary mode
        imgByteArr = io.BytesIO()
        img.save(imgByteArr, format=image_extension.upper())
        imgByteArr = imgByteArr.getvalue()

        # upload image to our bucket
        # First check if it already exists
        exists = False
        try:
            s3.Object(bucket_name, bucket_folder + image_name[-1]).load()
        except botocore.exceptions.ClientError as e:
            if e.response['Error']['Code'] == "404":
                exists = False
        else:
            exists = True
        if exists:
            logging.info("[IMG] Skipping upload: %s already exists" %
                         image_src)
        else:
            logging.info("[IMG] Uploading %s" % image_src)
            s3.Bucket(bucket_name).put_object(Key=bucket_folder +
                                              image_name[-1],
                                              Body=imgByteArr,
                                              ACL='public-read',
                                              ContentType=mime_type,
                                              CacheControl=cache_control,
                                              Expires=expires)

        logging.info("[IMG] Replace %s with %s" %
                     (image_src, 'https://s3-eu-west-1.amazonaws.com/' +
                      bucket_name + '/' + bucket_folder + image_name[-1]))
        # replace the src of the image with the new uploaded location
        html_string = html_string.replace(
            image_src, 'https://s3-eu-west-1.amazonaws.com/' + bucket_name +
            '/' + bucket_folder + image_name[-1])

        logging.info("[IMG] Replaced with %s", image_src)

    logging.info("[IMG] Finished analyzing html for job %s in file %s", job_id,
                 file_name)

    return html_string
Code Example #25
            project_id = json.load(f)['project_id']
        bucket = storage_client.bucket('tyeoh-streetcred',
                                       user_project=project_id)

        metadata_dir = os.path.join(app_dir, 'metadata')

        if not os.path.isdir(metadata_dir):
            os.makedirs(metadata_dir)
            logging.info('Created %s' % metadata_dir)

        for api in apis:
            logging.info('Loading data for %s' % api)
            request_date = datetime.now(
                tz=pytz.timezone('Singapore')).strftime('%Y%m%d')
            df = generate_table(api)
            dest_path = os.path.join(
                metadata_dir, '%s_%s_metadata.csv.xz' % (request_date, api))
            df.to_csv(dest_path, index=False, header=True, compression='xz')
            logging.info('Saved data to %s' % dest_path)

            upload_blob(
                bucket, dest_path,
                '%s_metadata/%s_%s_metadata.csv.xz' % (api, request_date, api))
            os.remove(dest_path)
            logging.info('Deleted %s' % dest_path)

    except Exception as e:
        logging.error("Exception occurred", exc_info=True)
        raise
    else:
        logging.info('Script complete')
Code Example #26
File: es_util.py Project: dovanduy/choinho
 def delete_batch(self, indexName, indexType, docids):
     actions = self._buildDeleteActions(indexName, indexType, docids)
     success, errors = helpers.bulk(self.esConn, actions)  # @UnusedVariable
     if errors:
         logging.error("Delete batch: there are some errors %s", errors)
Code Example #27
        logger.setLevel(logging.INFO)

        #Parse the arguments provided on the command line.
        parser = argparse.ArgumentParser()
        parser.add_argument("--cass_keyspace", help="keyspace")
        parser.add_argument("--cass_table", help="table")
        parser.add_argument("--mongo_db", help="Mongo db")
        parser.add_argument("--mongo_collection", help="Mongo collection")
        parser.add_argument("--incremental_run",
                            help="Full table load or incremental run")

        args = parser.parse_args()
        if not (args.cass_keyspace and args.cass_table and args.mongo_db
                and args.mongo_collection and args.incremental_run):
            logging.error(
                "Command line arguments are missing. Possibly --cass_keyspace --cass_table --mongo_db --mongo_collection --incremental_run "
            )
            sys.exit()
        if args.incremental_run not in ['0', '1']:
            logging.error("Incremental run should be either 0 or 1")
            sys.exit()
        incremental_run = int(args.incremental_run)

        logging.info("Argument parsed successfully")

        #Spawn spark session
        spark = pyspark.sql.SparkSession.builder\
                    .appName('test-mongo')\
                    .master('local[*]')\
                    .getOrCreate()
        df = read_from_cassandra(incremental_run, args.cass_keyspace,
Code Example #28
File: app.py Project: fusionbob/microdrop
    def __init__(self):
        args = parse_args()

        print 'Arguments: %s' % args

        self.name = "microdrop.app"
        # get the version number
        self.version = ""
        try:
            raise Exception
            version = subprocess.Popen(['git','describe'],
                                       stdout=subprocess.PIPE,
                                       stderr=subprocess.PIPE,
                                       stdin=subprocess.PIPE).communicate()[0].rstrip()
            m = re.match('v(\d+)\.(\d+)-(\d+)', version)
            self.version = "%s.%s.%s" % (m.group(1), m.group(2), m.group(3))
            branch = subprocess.Popen(['git','rev-parse', '--abbrev-ref', 'HEAD'],
                                       stdout=subprocess.PIPE,
                                       stderr=subprocess.PIPE,
                                       stdin=subprocess.PIPE).communicate()[0].rstrip()
            if branch.strip() != 'master':
                self.version += "-%s" % branch
        except:
            import pkg_resources

            version = pkg_resources.get_distribution('microdrop').version

            dev = ('dev' in version)

            self.version = re.sub('\.dev.*', '',
                                  re.sub('post', '', version))
            if dev:
                self.version += "-dev"

        self.realtime_mode = False
        self.running = False
        self.builder = gtk.Builder()
        self.signals = {}
        self.plugin_data = {}

        # these members are initialized by plugins
        self.experiment_log_controller = None
        self.config_controller = None
        self.dmf_device_controller = None
        self.protocol_controller = None
        self.main_window_controller = None

        # Enable custom logging handler
        logger.addHandler(CustomHandler())
        self.log_file_handler = None

        # config model
        try:
            self.config = Config(args.config)
        except IOError:
            logging.error('Could not read configuration file, `%s`.  Make sure'
                          ' it exists and is readable.', args.config)
            raise SystemExit(-1)

        # set the log level
        if self.name in self.config.data and ('log_level' in
                                              self.config.data[self.name]):
            self._set_log_level(self.config.data[self.name]['log_level'])
        logger.info('Microdrop version: %s', self.version)
        logger.info('Running in working directory: %s', os.getcwd())

        # Run post install hooks for freshly installed plugins.
        # It is necessary to delay the execution of these hooks here due to
        # Windows file locking preventing the deletion of files that are in use.
        post_install_queue_path = \
            path(self.config.data['plugins']['directory']) \
            .joinpath('post_install_queue.yml')
        if post_install_queue_path.isfile():
            post_install_queue = yaml.load(post_install_queue_path.bytes())
            post_install_queue = map(path, post_install_queue)

            logger.info('[App] processing post install hooks.')
            for p in post_install_queue:
                try:
                    info = get_plugin_info(p)
                    logger.info("  running post install hook for %s" %
                                info.plugin_name)
                    plugin_manager.post_install(p)
                finally:
                    post_install_queue.remove(p)
            post_install_queue_path.write_bytes(yaml.dump(post_install_queue))

        # Delete paths that were marked during the uninstallation of a plugin.
        # It is necessary to delay the deletion until here due to Windows file
        # locking preventing the deletion of files that are in use.
        deletions_path = path(self.config.data['plugins']['directory'])\
                .joinpath('requested_deletions.yml')
        if deletions_path.isfile():
            requested_deletions = yaml.load(deletions_path.bytes())
            requested_deletions = map(path, requested_deletions)

            logger.info('[App] processing requested deletions.')
            for p in requested_deletions:
                try:
                    if p != p.abspath():
                        logger.info('    (warning) ignoring path %s since it '\
                            'is not absolute' % p)
                        continue
                    if p.isdir():
                        info = get_plugin_info(p)
                        if info:
                            logger.info('  deleting %s' % p)
                            cwd = os.getcwd()
                            os.chdir(p.parent)
                            try:
                                path(p.name).rmtree() #ignore_errors=True)
                            except Exception, why:
                                logger.warning('Error deleting path %s (%s)'\
                                        % (p, why))
                                raise
                            os.chdir(cwd)
                            requested_deletions.remove(p)
                    else: # if the directory doesn't exist, remove it from the
                          # list
                        requested_deletions.remove(p)
                except (AssertionError,):
                    logger.info('  NOT deleting %s' % (p))
                    continue
Code Example #29
def catch_exception(session, e):
    logging.error("DbConnectorRetrying error. Catch exception with traceback")
    logging.exception(e)
    session.rollback()
Code Example #30
def parse_env():
    args = SimpleNamespace(**{})
    try:
        args.WORKING_DIR = os.environ['WORKING_DIR']
    except KeyError:
        args.WORKING_DIR = '.'

    args.BACKUP_GIT_WORKING_DIR = args.WORKING_DIR + '/backup'
    args.SECRET_GIT_WORKING_DIR = args.WORKING_DIR + '/secret'
    args.temp_ssh_file = None
    args.temp_cert_file = None

    try:
        args.GIT_SSH_PRIVATE_KEY_LOC = os.environ['GIT_SSH_PRIVATE_KEY_LOC']
    except KeyError:
        try:
            private_key = os.environ['GIT_SSH_PRIVATE_KEY']
            args.GIT_SSH_PRIVATE_KEY_LOC = args.temp_ssh_file = args.WORKING_DIR + '/ssh_key'
            f = open(args.GIT_SSH_PRIVATE_KEY_LOC, 'w')
            f.write(private_key)
            f.close()
            os.chmod(args.GIT_SSH_PRIVATE_KEY_LOC, 0o600)
        except KeyError:
            log.error(
                'Either GIT_SSH_PRIVATE_KEY_LOC or GIT_SSH_PRIVATE_KEY environment variable must be set.'
            )
            exit(1)

    try:
        args.LOG_LEVEL = os.environ['LOG_LEVEL']
    except KeyError:
        args.LOG_LEVEL = 'WARNING'

    try:
        args.BACKUP_GIT_REPO = os.environ['BACKUP_GIT_REPO']
    except KeyError:
        log.error('BACKUP_GIT_REPO environment variable must be set.')
        exit(1)

    try:
        args.SECRET_GIT_REPO = os.environ['SECRET_GIT_REPO']
    except KeyError:
        log.error('SECRET_GIT_REPO environment variable must be set.')
        exit(1)

    try:
        args.KUBERNETES_SERVICE_HOST = os.environ['KUBERNETES_SERVICE_HOST']
    except KeyError:
        log.error('KUBERNETES_SERVICE_HOST environment variable must be set.')
        exit(1)

    try:
        args.KUBERNETES_SERVICE_PORT = os.environ['KUBERNETES_SERVICE_PORT']
    except KeyError:
        log.error('KUBERNETES_SERVICE_PORT environment variable must be set.')
        exit(1)

    try:
        args.KUBERNETES_TOKEN = os.environ['KUBERNETES_TOKEN']
    except KeyError:
        log.error('KUBERNETES_TOKEN environment variable must be set.')
        exit(1)

    try:
        args.SERVICE_CERT_FILENAME = os.environ['SERVICE_CERT_FILENAME']
    except KeyError:
        try:
            service_cert = os.environ['SERVICE_CERT']
            args.SERVICE_CERT_FILENAME = args.temp_cert_file = args.WORKING_DIR + '/ca.crt'
            f = open(args.SERVICE_CERT_FILENAME, 'w')
            f.write(service_cert)
            f.close()
            os.chmod(args.SERVICE_CERT_FILENAME, 0o600)
        except KeyError:
            log.error(
                'Either SERVICE_CERT_FILENAME or SERVICE_CERT environment variable must be set.'
            )
            exit(1)

    return args
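
The required/optional pattern above can also be written with os.environ.get and a small helper; a compact sketch under the same assumptions (exit when a required variable is missing), shown for two of the variables:

import logging
import os
import sys


def require_env(name):
    value = os.environ.get(name)
    if value is None:
        logging.error('%s environment variable must be set.', name)
        sys.exit(1)
    return value


WORKING_DIR = os.environ.get('WORKING_DIR', '.')   # optional, with default
BACKUP_GIT_REPO = require_env('BACKUP_GIT_REPO')   # required
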
Code Example #31
def download_image(source, destination):
    try:
        urllib.request.urlretrieve(source, destination)
    except urllib.error.URLError as e:
        logging.error(e)
        raise  # re-raise the original URLError with its traceback