def get_vbb_data(centre):
	global stations
	global station_types
	g = Graph()
	with open('nodes.ndjson') as f:
		dataSta = ndjson.load(f)

	for i in dataSta:
		#tupel = str(i['metadata']['x'])+","+str(i['metadata']['y'])
		x = float(i['metadata']['longitude'])
		y = float(i['metadata']['latitude'])
		idSt = str(i['id'])
		g.add_node(idSt)
		stations[idSt] = (x, y)
		# g.add_node(tupel)

	with open('edges.ndjson') as f:
		dataDist = ndjson.load(f)

	for i in dataDist:
		stationA = str(i['source'])
		stationB = str(i['target'])
		distance = int(i['metadata']['time'])
		line = i['metadata']['line']
		if line.startswith('RB') or line.startswith('RE'):  # regional trains ('RE' assumed; the source tested 'RB' twice)
			station_types[stationA] = 1
			station_types[stationB] = 1
		elif line.startswith('U') or line.startswith('S'):
			if stationA in station_types:
				if station_types[stationA] > 1:
					station_types[stationA] = 2
			else:
				station_types[stationA] = 2
			if stationB in station_types:
				if station_types[stationB] > 1:
					station_types[stationB] = 2
			else:
				station_types[stationB] = 2
		else:
			if stationA in station_types:
				if station_types[stationA] > 2:
					station_types[stationA] = 3
			else:
				station_types[stationA] = 3

			if stationB in station_types:
				if station_types[stationB] > 2:
					station_types[stationB] = 3
			else:
				station_types[stationB] = 3
		g.add_edge(stationA, stationB, distance)

	return dijsktra(g, centre)  # Station name of Dabendorf node: 900000245024
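
A minimal sketch of the Graph/dijsktra interface the snippet above assumes; both are defined elsewhere in the original project, so everything below is an assumption rather than the original code:

import heapq
from collections import defaultdict

class Graph:
    def __init__(self):
        self.nodes = set()
        self.edges = defaultdict(list)  # node -> [(neighbour, weight), ...]

    def add_node(self, node):
        self.nodes.add(node)

    def add_edge(self, a, b, weight):
        # undirected, since a rail link can be travelled in both directions
        self.edges[a].append((b, weight))
        self.edges[b].append((a, weight))

def dijsktra(graph, start):
    # classic Dijkstra: shortest travel time from start to every reachable node
    dist = {start: 0}
    queue = [(0, start)]
    while queue:
        d, node = heapq.heappop(queue)
        if d > dist.get(node, float('inf')):
            continue
        for neighbour, weight in graph.edges[node]:
            nd = d + weight
            if nd < dist.get(neighbour, float('inf')):
                dist[neighbour] = nd
                heapq.heappush(queue, (nd, neighbour))
    return dist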
Example #2
def get_json(username, game_mode="bullet", update=True, ensure_complete=False):
    json_file_path = f'data/lichess_{username}_{game_mode}.json'  # forward slash avoids stray backslash escapes
    url = f'https://lichess.org/api/games/user/{username}'
    headers = {'Accept': 'application/x-ndjson'}
    parameters = {'rated': 'true', 'perfType': game_mode, 'max': 500}
    json_file = Path(json_file_path)
    if not json_file.is_file():
        print(f"File {json_file_path} not found, downloading...")
        r = requests.get(url, headers=headers, params=parameters)
        print(f"Download complete.")
        ensure_complete = True
        with open(json_file_path, 'w') as f:
            json_games = ndjson.loads(r.text)
            ndjson.dump(json_games, f)
    else:
        with open(json_file, 'r') as file:
            json_games = ndjson.loads(file.read())

    if ensure_complete:
        until = json_games[-1]['createdAt']
        parameters['until'] = until
        old_games = True
        while old_games:
            until_date = datetime.fromtimestamp(until / 1000)
            print(f"Checking games before {until_date:%d/%m/%y %H:%M}...")
            r = requests.get(url, headers=headers, params=parameters)
            old_games = ndjson.loads(r.text)
            if old_games:
                until = old_games[-1]['createdAt']
                parameters['until'] = until
                print(f'Found {len(old_games)} older games.')
                json_games += old_games
                with open(json_file_path, 'a') as f:
                    f.write('\n')
                    ndjson.dump(old_games, f)
            else:
                print('No older games found')

    if not update:
        return json_games

    since = json_games[0]['createdAt']
    parameters['since'] = since
    del parameters['max']
    since_date = datetime.fromtimestamp(since / 1000)
    print(f"Checking games after {since_date:%d/%m/%y %H:%M}...")

    r = requests.get(url, headers=headers, params=parameters)
    new_games = ndjson.loads(r.text)  # ndjson.loads expects a str, not bytes
    if new_games:
        print(f'Found {len(new_games)} new games')
        with open(json_file_path, 'w') as f:
            ndjson.dump(new_games + json_games, f)
    else:
        print('No newer games found')

    return new_games + json_games
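
For reference, each line of the Lichess export is one JSON game object, and createdAt (epoch milliseconds) is the field the pagination above keys on. An illustrative round trip with made-up game ids:

import ndjson
from datetime import datetime

sample = ('{"id": "abcd1234", "createdAt": 1609459200000}\n'
          '{"id": "efgh5678", "createdAt": 1609462800000}')
games = ndjson.loads(sample)
print(datetime.fromtimestamp(games[0]['createdAt'] / 1000))  # 2021-01-01 00:00:00 UTC, shown in local time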
Example #3
def handler(event, context):

  s3 = boto3.resource('s3')
  obj = s3.Object("abalustre-btc-contracts", event["Records"][0]['s3']['object']['key'])
  body = obj.get()["Body"].read().decode("utf-8")

  es = Elasticsearch(
      cloud_id = (os.environ["ELASTICID"]),
      http_auth = (os.environ["ELASTICUSER"], os.environ["ELASTICPASSWORD"]))

  read_lines = read_file_lines(body) # read file into list of lines
  tg_list = target_and_group_list()
  g_list = segmenta_imbarq(lines=read_lines, target_group_list=tg_list)
  btc_bulk = g_list[6]

  btc_obj_list = build_btc_contract_info(btc_bulk)

  btc_contract_list = []

  for i in range(len(btc_obj_list)):
    obj = btc_obj_list[i]
    contract_id = obj["contract-number"]
    position_date = obj["position-date"].strftime("%Y-%m-%d")
    btc_contract_list.append({
      "_index": "daily-position",
      "_type": "_doc",
      "_id": "C:"+contract_id+"_D:"+position_date,
      "_source": obj
    })

  json_data = '\n'.join(json.dumps(contract) for contract in btc_contract_list)
  data_post = ndjson.loads(json_data)
  helpers.bulk(es, data_post)
  
  return btc_contract_list
Example #4
def is_dnsdbflex(data: str) -> bool:
    """Check if the supplied data conforms to the dnsdbflex output (which only contains rrname and rrtype)

    Parameters
    ----------
      ndjson data as a string

    Returns
    -------
      True or False

    Raises
    --------
      none
    """

    try:
        j = ndjson.loads(data)
        for line in j:
            if not set(line.keys()) == {'rrname', 'rrtype'}:
                return False  # shortcut. We assume it's not if a single line does not conform
        return True
    except Exception as ex:
        print(
            "oops, this should not have happened. Maybe not an ndjson file? Reason: %s"
            % (str(ex), ),
            file=sys.stderr)
        return False
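
A quick usage sketch with a hand-made two-line NDJSON string:

ok = is_dnsdbflex('{"rrname": "example.com.", "rrtype": "A"}\n'
                  '{"rrname": "example.org.", "rrtype": "NS"}')
print(ok)  # True; a line with any extra or missing key yields False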
Example #5
def fetch_kibana_object(obj_type, exportpath):
    try:
        print('# Fetching kibana objects: %s' % obj_type)
        response = requests.post(KIBANA_OBJECTS_EXPORT_URL, json={'type':obj_type}, verify=False, auth=(KIBANA_USER,KIBANA_PASS), headers={'kbn-xsrf':'true'})
        if response.status_code != 200:
            print('!!! Error fetching kibana object %s: HTTP status code %s' % (obj_type, response.status_code))
        else:
            items = ndjson.loads(response.text)  # loads takes a str; re-encoding to bytes is unnecessary
            if obj_type != 'index-pattern':
                toExport = []
                for ip in items:
                    if 'attributes' in ip.keys() and 'title' in ip['attributes']:
                        if re.match(REDELK_OBJ_FILTER, ip['attributes']['title'], re.IGNORECASE):
                            ip.pop('updated_at', None)
                            ip['version'] = '1'
                            toExport.append(ip)
                export_file = os.path.join(exportpath, '%s%s.ndjson' % (EXPORT_FILES_PREFIX_KIBANA, obj_type))
                print('\tExporting %s: %s' % (obj_type, export_file))
                with open(export_file, 'w') as f:
                    ndjson.dump(toExport, f)
            else:
                for ip in items:
                    if 'attributes' in ip.keys() and 'title' in ip['attributes']:
                        if re.match(INDEX_PATTERNS_FILTER, ip['attributes']['title'], re.IGNORECASE):
                            #print('%s: %s' % (obj_type,ip['attributes']['title']))
                            pn = ip['attributes']['title'][:-2] if ip['attributes']['title'].endswith('-*') else ip['attributes']['title']
                            ip.pop('updated_at', None)
                            ip['version'] = '1'
                            export_file = os.path.join(exportpath, '%s%s_%s.ndjson' % (EXPORT_FILES_PREFIX_KIBANA, obj_type, pn))
                            print('\tExporting %s: %s' % (obj_type, export_file))
                            with open(export_file, 'w') as f:
                                ndjson.dump([ip], f)
    except Exception as e:
        print('!!! Error fetching kibana object %s: %s' % (obj_type, e))
Example #6
    def populate_explanations(self, x_out, ignore_first=False):
        r_index = None
        out_x = []  # empty default so a non-200 response yields nothing to iterate
        xp_map = self.columnInfo.xpMap
        if x_out.status_code == 200:
            out_x = ndjson.loads(x_out.content.decode())
        isFirst = True
        for item in out_x:
            if ignore_first and isFirst:
                isFirst = False
                continue
            oitem = self.outputMap.get(item['time'])
            if oitem is None:
                oput = FalkonryOutput(self.columnInfo)
                oput.set_entity_batch(item)
                oput.time = item['time']
                oput.explanations[xp_map[item['signal']]] = item['score']
                self.outputMap[oput.time] = oput

                self.timeArrowList.append({"ts": oput.time, "complete": 0})
            else:
                oitem.explanations[xp_map[item['signal']]] = item['score']

            r_index = item['index']
        return r_index
Example #7
def get_posting_list_for_token(token: str) -> dict:
    """
    Get posting list of token

    """
    posting_list: dict = dict()
    for file in file_names:
        if token <= file:
            with open(FINAL_INDEX_PATH + "{}.json".format(file),
                      "r") as file_r:  # ,buffering=const_size_in_bytes
                for line in file_r:  # iterate lines; stops cleanly at EOF
                    if line.strip():
                        term, posting_list_r = ndjson.loads(line)[0]
                        if token == term:  # if found token in document
                            print("Token '{}' is processing".format(token))
                            return dict(
                                sorted(posting_list_r.items(),
                                       key=lambda x: int(x[0])))
                        elif token < term:  # optimization, not go through all document, if not found token,
                            # but token already bigger value than term
                            print("Word '{}' not found across all documents".
                                  format(token))
                            return posting_list
    return posting_list
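
The unpacking term, posting_list_r = ndjson.loads(line)[0] implies each index line is a two-element JSON array pairing a term with its posting list. An illustrative line (term and counts are made up):

import ndjson

line = '["apple", {"3": 2, "17": 1}]'
term, postings = ndjson.loads(line)[0]
print(term, postings)  # apple {'3': 2, '17': 1}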
Example #8
    def _populate_value(self, c_input, isCondition, ignore_first=False):
        """
        Common method to populate the condition/prediction label and confidence score.
        """
        r_index = None
        out_c = []  # empty default so a non-200 response yields nothing to iterate
        if c_input.status_code == 200:
            out_c = ndjson.loads(c_input.content.decode())
        isFirst = True
        for item in out_c:
            if ignore_first and isFirst:
                isFirst = False
                continue
            oitem = self.outputMap.get(item['time'])
            if oitem is None:
                oitem = FalkonryOutput(self.columnInfo)
                oitem.set_entity_batch(item)
                oitem.time = item['time']
                self.outputMap[oitem.time] = oitem
                self.timeArrowList.append({"ts": oitem.time, "complete": 0})

            if isCondition:
                oitem.condition = item['value']
            else:
                oitem.confidence = item['value']
            r_index = item['index']
        return r_index
Example #9
def get_last(channel_id):
    read_storage_client = storage.Client()
    bucket_name = 'airqo-bucket'
    filename = 'channel%s.json'%channel_id
    
    bucket = read_storage_client.get_bucket(bucket_name)
    stats = storage.Blob(bucket=bucket, name=filename).exists(read_storage_client)
    #size= storage.get_blob(bucket=bucket, name=filename).chunksize
    if not stats:
        last_id = 0
        last_time = None
    else:
        blob = bucket.get_blob(filename)
        json_data_string = blob.download_as_string()
        json_data = ndjson.loads(json_data_string)
        json_list = list(json_data)

        if len(json_list) != 0:
            last_id = json_list[-1]['entry_id']
            last_time = str_to_date(json_list[-1]['created_at'])
        else:
            last_id = None
            last_time = None

    return last_id, last_time
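
An illustrative blob payload, assuming one record per line with the entry_id and created_at fields the code above reads (the values are made up):

import ndjson

sample = ('{"entry_id": 40, "created_at": "2020-01-01T00:00:00Z"}\n'
          '{"entry_id": 41, "created_at": "2020-01-01T00:05:00Z"}')
records = ndjson.loads(sample)
print(records[-1]['entry_id'])  # 41 -- the newest entry, as get_last() returns it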
Example #10
def parse_and_insert_dnsdbflex(data: str):
    """Parse and validate the more simplier dndsdbflex output data.

    Parameters
    ----------
      data as a string

    Returns
    -------
      A dict with either the error message or the data which may be sent off the the caller of handler()

    Raises
    --------
      none
    """
    objects = []
    try:
        entries = ndjson.loads(data)
        for entry in entries:  # iterate over all ndjson lines
            # validate here (simple validation or full JSON Schema validation)
            if not validate_dnsdbflex(entry):
                return {
                    "error":
                    "Could not validate the dnsdbflex input '%s'" % entry
                }

            # Next, extract some fields
            rrtype = entry['rrtype'].upper()
            rrname = entry['rrname'].rstrip('.')

            # create a new MISP object, based on the passive-dns object for each nd-JSON line
            try:
                o = MISPObject(name='passive-dns',
                               standalone=False,
                               distribution=0,
                               comment='DNSDBFLEX import by cof2misp')
                o.add_attribute('rrtype',
                                value=rrtype,
                                distribution=0,
                                comment='DNSDBFLEX import by cof2misp')
                o.add_attribute('rrname',
                                value=rrname,
                                distribution=0,
                                comment='DNSDBFLEX import by cof2misp')
            except Exception as ex:
                print("could not create object. Reason: %s" % str(ex))
                continue  # skip this entry; 'o' is unbound when creation fails

            #
            # add dnsdbflex entry to MISP object
            #
            objects.append(o.to_json())

        r = {'results': {'Object': [json.loads(o) for o in objects]}}
    except Exception as ex:
        misperrors["error"] = "An error occurred during parsing of input: '%s'" % (
            str(ex), )
        return misperrors
    return r
Example #11
    def VTH_SetSecPanel(self, CVQ6081_Alarm):

        if CVQ6081_Alarm == 0:      
            self.AlarmEnable = False
        else: 
            self.AlarmEnable = True            
            self.AlarmProfile = AlarmProfile[ CVQ6081_Alarm ]   

        query_args = {
                "method":"configManager.setConfig",
                "magic" : "0x1234",
                "params":{
                    "table":{
                        "AlarmEnable"    : self.AlarmEnable,
                        "CurrentProfile" : self.AlarmProfile,
                        "ProfileEnable"  : True,
                        "Profiles"       : self.AlarmConfig
                    },
                    "name":"CommGlobal",
                },
                "session":self.SessionID,
                "id":self.ID
        }
                
        if verbose: log.info("[" + str(datetime.datetime.now()) + " VTH_BOX] Updating to: {}".format(query_args))    
        if verbose: log.info("VTH_SetSecPanel Service Call request: {}".format(json.dumps(query_args)))
        
        data = self.P2P(json.dumps(query_args))
        
        if data is None:
            log.failure("[" + str(datetime.datetime.now()) + " VTH_BOX-P2P_FAILURE] SetSecPanel Failed - No answer" )
            self.VTH_ON_LINE = self.P2P_traceError() # KO 
            return False
        elif len(data) == 1:            
            if verbose: log.info("P2P-1. VTH_SetSecPanel Service Call answer: {}".format(data))
            data = json.loads(data)
            if data.get('result'):
                self.VTH_ON_LINE = self.P2P_traceError() # KO
                return False
            else: 
                self.VTH_ON_LINE = self.P2P_traceError() # KO
                return False
        else:
            if verbose: log.info("P2P-2. VTH_SetSecPanel Service Call answer: {}".format(data))
            data = ndjson.loads(data)               
            if data[0].get('method') == "client.notifyConfigChange":
                self.AlarmEnable  = data[0]['params']['table']['AlarmEnable']
                self.AlarmProfile = data[0]['params']['table']['CurrentProfile']
                self.AlarmConfig  = data[0]['params']['table']['Profiles']
                if verbose: log.info("[" + str(datetime.datetime.now()) + " VTH_BOX-AlarmEnable] been changed remotely to: {}".format(self.AlarmEnable)) 
                if verbose: log.info("[" + str(datetime.datetime.now()) + " VTH_BOX-AlarmProfile] is: {}".format(self.AlarmProfile))
                if verbose: log.info("[" + str(datetime.datetime.now()) + " VTH_BOX-AlarmConfiguration] is: {}".format(self.AlarmConfig))
                if not self.AlarmEnable:
                    AlarmToken['nvalue'] = 0
                else: 
                    AlarmToken['nvalue'] = VTHAlarmProfile[ self.AlarmProfile ]              
                
        self.VTH_ON_LINE = 0 # OK
        return True
Example #12
 def load_data():
     data = ndjson.loads(req_data["data"])
     for event in data:
         if isinstance(event, list):
             for e in event:
                 yield e
         else:
             yield event
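
A small demonstration of how the generator flattens one level of nested lists, treating req_data as a module-level dict for the sake of the sketch:

req_data = {"data": '{"a": 1}\n[{"b": 2}, {"c": 3}]'}
print(list(load_data()))  # [{'a': 1}, {'b': 2}, {'c': 3}]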
Example #13
 def get(self):
     covid_19 = Covid_19(storage.Client())
     covid_19_blob = covid_19.get_json()
     if covid_19_blob is not None and not covid_19_blob.exists():
         abort(400, "No such data found")
     elif covid_19_blob is None:
         return "Error fetching file"
     return ndjson.loads(covid_19_blob.download_as_string())
Example #14
 def _deserialize_ndjson_string(byte_string) -> List[object]:
     """
     Deserialize the contents of a newline-delimited JSON string to a list
     Args:
         byte_string: The NDJSON contents to be deserialized
     Returns:
         list: Each individual JSON entry deserialized as Python objects
     """
     utf8_string = str(byte_string, 'utf-8')
     content = ndjson.loads(utf8_string)
     return content
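
A minimal round trip: bytes in, list of Python objects out:

payload = b'{"x": 1}\n{"x": 2}'
print(_deserialize_ndjson_string(payload))  # [{'x': 1}, {'x': 2}]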
Example #15
 async def stream_game_state(self, game):
     url = 'https://lichess.org/api/bot/game/stream/'+game
     async with self.session.get(url) as response:
         chunk = await response.content.read(0x100000)
         print(chunk)
         try:
             chunk = ndjson.loads(chunk)
             print(chunk)
         except Exception as e:
             print('Error parsing game stream:', e)
             exit()
         return chunk
Example #16
    def _fetch_remote_ndjson(self, url: str) -> List[Dict[str, Any]]:
        """
        Fetches the remote ndjson file and caches the results.

        Args:
            url (str): Can be any url pointing to an ndjson file.
        Returns:
            ndjson as a list of dicts.
        """
        response = requests.get(url)
        response.raise_for_status()
        return ndjson.loads(response.text)
Example #17
def main():

    # Read the data from Google Cloud Storage
    read_storage_client = storage.Client()

    # Set buckets and filenames
    bucket_name = "all_alerts_7_21"  #I'm  using project: cops-cloudmonus-nonprod-563b in mckesson's gcp
    filename = "sample_json.json"

    # get bucket with name
    bucket = read_storage_client.get_bucket(bucket_name)

    # get bucket data as blob
    blob = bucket.get_blob(filename)

    # convert to string
    json_data_string = blob.download_as_string()
    #print(json_data_string)
    json_data = ndjson.loads(json_data_string)
    #print(json_data)
    #json_data = json.loads(json_data_string)
    items = []  # avoid shadowing the built-in 'list'
    for item in json_data:
        items.append(item)
        #print(item)
        #print(item['Website'])

    list1 = items[:]
    print(list1)

    #removing something
    list_less = []
    for item in list1:
        if item["Website"] != "Yandex":
            list_less.append(item)

    result = ""
    for item in list_less:
        item2 = json.dumps(item)
        result = result + str(item2) + "\n"

    #adding something else
    #item={"Website": "Yandex", "URL": "Yandex.com", "ID": 4}
    #item2=json.dumps(item)
    #result = result + str(item2) + "\n"

    print(result)

    #result_json=json.dumps(result)
    #print(result_json)

Example #18
def write_data(data, topicname):
    list = ndjson.loads(data)
    for eachline in list:
        msg = str(eachline)
        futures.update({msg: None})
        # When you publish a message, the client returns a future.
        future = publisher.publish(
            topicname,
            msg.encode("utf-8")  # data must be a bytestring.
        )
        futures[msg] = future
        # Publish failures shall be handled in the callback function.
        future.add_done_callback(get_callback(future, msg))
Example #19
def test_delete_log():
    resp1 = requests.post("http://txlogging:8080/log", json=msg)

    assert resp1.status_code == 200

    time.sleep(10)

    resp2 = requests.get("http://txlogging:8080/log", params={})

    assert resp2.status_code == 200
    assert len(ndjson.loads(resp2.text)) == 1

    resp3 = requests.delete("http://txlogging:8080/log")

    assert resp3.status_code == 200

    resp4 = requests.get("http://txlogging:8080/log", params={})

    assert resp4.status_code == 200
    assert len(ndjson.loads(resp4.text)) == 0

    requests.delete("http://txlogging:8080/log")
Example #20
 def crawler(self, data: str):
     data = ndjson.loads(data)
     self.json_schema = self.__get_json_schema(data)
     columns = []
     for column_name, column_prop in self.json_schema['items']['properties'].items():
         column = {
             'name': column_name,
             'type': self.__get_type(column_prop),
             'nullable': self.__is_nullable(column_prop),
             'default': None
         }
         columns.append(column)
     return columns
Example #21
def test_logging():

    resp1 = requests.post("http://txlogging:8080/log", json=msg)

    assert resp1.status_code == 200

    time.sleep(10)

    resp2 = requests.get("http://txlogging:8080/log", params={})

    assert resp2.status_code == 200
    assert len(ndjson.loads(resp2.text)) == 1

    start = "2001-01-01T00:00:00-01:00"
    end = "2002-01-01T00:00:00-01:00"

    resp3 = requests.get("http://txlogging:8080/log",
                         params={
                             "start": start,
                             "end": end
                         })

    assert resp3.status_code == 200
    assert len(ndjson.loads(resp3.text)) == 1

    start = "2002-01-01T00:00:00-01:00"
    end = "2003-01-01T00:00:00-01:00"

    resp4 = requests.get("http://txlogging:8080/log",
                         params={
                             "start": start,
                             "end": end
                         })

    assert resp4.status_code == 200
    assert len(ndjson.loads(resp4.text)) == 0

    requests.delete("http://txlogging:8080/log")
Example #22
def get_cloudtrail_file(key_prefix):
    s3 = boto3.resource('s3')
    b = s3.Bucket(settings["logs_bucket"])
    result = []
    for obj in b.objects.filter(Prefix=key_prefix):
        print(obj.key)
        s3_object = s3.Object(settings["logs_bucket"], obj.key).get()
        object_content = s3_object['Body'].read()
        results = gzip.decompress(object_content)
        for i in ndjson.loads(results.decode('utf-8'))[0]['Records']:  # decode: loads expects a str
            i['event_type'] = i['eventType']
            i['event_time'] = int(get_time(i).timestamp())
            result.append(i)
    return result
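
For context: CloudTrail delivers each log file as a single gzipped JSON document shaped like {"Records": [...]}, which is why the code takes [0]['Records'] after ndjson.loads. A self-contained sketch of that decode path:

import gzip
import json
import ndjson

doc = {"Records": [{"eventType": "AwsApiCall", "eventTime": "2021-01-01T00:00:00Z"}]}
blob = gzip.compress(json.dumps(doc).encode('utf-8'))  # what lands in the S3 bucket
records = ndjson.loads(gzip.decompress(blob).decode('utf-8'))[0]['Records']
print(records[0]['eventType'])  # AwsApiCall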
Example #23
def open_and_read(file_path):
    # file path is the path of the file as a string.
    object_list = []
    with open(file_path) as f:
        for line in f:
            if line != "\n":
                stroke_dict_list = ndjson.loads(line)
                stroke_dict = stroke_dict_list[0]
                # now I have a dictionary named stroke_dict.
                my_stroke_list = stroke_dict.get("drawing")
                my_stroke_key_ids = stroke_dict.get("key_id")
                object_list.append([my_stroke_list, my_stroke_key_ids])

    return object_list
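
A usage sketch against a temporary file, assuming Quick, Draw!-style records with the drawing and key_id fields the .get() calls above expect:

import tempfile

line = '{"key_id": "1", "drawing": [[[0, 10], [5, 15]]]}\n'
with tempfile.NamedTemporaryFile('w', suffix='.ndjson', delete=False) as tmp:
    tmp.write(line)
print(open_and_read(tmp.name))  # [[[[[0, 10], [5, 15]]], '1']]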
Example #24
def decode_api_response(url):
    '''Return JSON from url response.'''

    response = requests.get(url)
    status = response.status_code

    try:
        response = response.json()
    except ValueError:
        try:
            response = ndjson.loads(response.text)
        except ValueError:
            print("Bad response")
            response = None

    return (status, response)
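
A hedged usage sketch; the URL is a placeholder:

status, payload = decode_api_response("https://example.org/api/items")
if payload is None:
    print("unusable response, HTTP status", status)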
Example #25
 async def stream(self):
     url = 'https://lichess.org/api/stream/event'
     while True:
         async with self.session.get(url) as response:
             chunk = await response.content.read(0x100000)
             try:
                 chunk = ndjson.loads(chunk)
                 print(chunk)
             except Exception as e:
                 print('ERROR:', e)
                 await asyncio.sleep(2)  # asyncio assumed imported; time.sleep would block the event loop
                 continue
             if len(chunk) > 0:
                 return chunk
             print('waiting for something to happen')
             await asyncio.sleep(4)
Example #26
def remove_reference(text):
    new_text = []
    new_references = []
    try:
        text = json.loads(text)
        for ref in text['references']:
            if not ref['id'].startswith('AX') and len(ref['id']) != 20:
                new_references.append(remove_obj_keys(ref))
        text['references'] = new_references
    except json.decoder.JSONDecodeError:
        text = ndjson.loads(text)
        for ref in text:
            if (not ref['references'][0]['id'].startswith('AX')
                    and len(ref['references'][0]['id']) != 20):
                new_text.append(remove_obj_keys(ref))

    return json.dumps(new_text) if new_text else json.dumps(text)
Example #27
def process_company_json(record: dict, bucket):
    """
    Fetches the file, stores it locally to preprocess,
    and returns the processed articles.

    ## args
    * record: a record describing a remote raw json file in storage
    * bucket: Google Bucket Instance

    ## returns
    processed articles based on the MongoDB Article model
    """

    blob = bucket.blob(record["source_file"])

    # convert to string
    json_data_string = blob.download_as_string()

    # returns a list
    data = ndjson.loads(json_data_string)
    logging.info("data storage length: {}".format(len(data)))

    processor = getattr(source_processor, record["source"])
    logging.info("processing: {}".format(record["source_file"]))
    processed_records = processor(data[0], record["entity_id"],
                                  record["scenario_id"], record["source_file"])

    # delete duplicates here
    df = pd.DataFrame(processed_records,
                      columns=[
                          "uuid", "entityID_id", "scenarioID_id", "title",
                          "unique_hash", "url", "search_keyword",
                          "published_date", "internal_source", "domain",
                          "language", "source_country", "raw_file_source",
                          "entry_created"
                      ])

    before = df.shape
    df.drop_duplicates(subset='url', keep="first", inplace=True)
    after = df.shape
    logging.info("Before: {}, After: {}".format(before, after))

    processed_records = df.values.tolist()
    return processed_records
Example #28
def raise_response_error(r):
    if r.status_code >= 400:
        print("==== Response Debugging ====")
        print("##Request Headers", r.request.headers)

        # extract content type
        ct = r.headers["content-type"].split(";")[0]
        if ct == ContentType.JSON.value:
            dump = dump_response(r)
            print(dump)
            print("##Response:", dump.decode("UTF-8"))
            err = dacite.from_dict(data_class=Error, data=r.json())
            print(err)
        elif ct == ContentType.NDJSON.value:
            decoded = ndjson.loads(r.text)
            print("##Response:", decoded)

        r.raise_for_status()
Example #29
def handler(event, context):

    s3 = boto3.resource('s3')
    obj = s3.Object("abalustre-btc-contracts",
                    event["Records"][0]['s3']['object']['key'])
    body = obj.get()["Body"].read()
    #print(body[1])
    lines = body.splitlines()
    #print(str(lines[1], 'ISO-8859-1'))
    #  print(str(lines[10], 'utf-8'))
    #  line10 = str(lines[10], 'utf-8')
    #  print(line10)
    #  utf_lines = []
    #  for i in range(len(lines)):
    #    utf_lines.append(str(lines[i], 'utf-8'))
    #  print(utf_lines)
    es = Elasticsearch(cloud_id=(os.environ["ELASTICID"]),
                       http_auth=(os.environ["ELASTICUSER"],
                                  os.environ["ELASTICPASSWORD"]))
    tg_list = target_and_group_list()
    g_list = segmenta_imbarq(lines=lines, target_group_list=tg_list)
    btc_bulk = g_list[6]
    btc_obj_list = build_btc_contract_info(btc_bulk)
    btc_contract_list = []
    for i in range(len(btc_obj_list)):
        obj = btc_obj_list[i]
        print(type(obj))
        print(obj)
        contract_id = obj['contract_number']
        position_date = obj['position_date']
        btc_contract_list.append({
            "_index": "btc-contracts",
            "_type": "_doc",
            "_id": "C:" + contract_id + "_D:" + position_date,
            "_source": obj
        })

    json_data = '\n'.join(
        json.dumps(contract) for contract in btc_contract_list)
    data_post = ndjson.loads(json_data)
    helpers.bulk(es, data_post)

    return btc_contract_list
Example #30
    def create_from_url(cls,
                        client,
                        project_id: str,
                        name: str,
                        url: str,
                        validate=True) -> 'BulkImportRequest':
        """
        Creates a BulkImportRequest from a publicly accessible URL
        to an ndjson file with predictions.

        Args:
            client (Client): a Labelbox client
            project_id (str): id of project for which predictions will be imported
            name (str): name of BulkImportRequest
            url (str): publicly accessible URL pointing to ndjson file containing predictions
            validate (bool): a flag indicating if there should be a validation
                if `url` is valid ndjson
        Returns:
            BulkImportRequest object
        """
        if validate:
            logger.warning(
                "Validation is turned on. The file will be downloaded locally and processed before uploading."
            )
            res = requests.get(url)
            data = ndjson.loads(res.text)
            _validate_ndjson(data, client.get_project(project_id))

        query_str = """mutation createBulkImportRequestPyApi(
                $projectId: ID!, $name: String!, $fileUrl: String!) {
            createBulkImportRequest(data: {
                projectId: $projectId,
                name: $name,
                fileUrl: $fileUrl
            }) {
                %s
            }
        }
        """ % query.results_query_part(cls)
        params = {"projectId": project_id, "name": name, "fileUrl": url}
        bulk_import_request_response = client.execute(query_str, params=params)
        return cls(client,
                   bulk_import_request_response["createBulkImportRequest"])