import random

import avro.schema
from avro.datafile import DataFileReader, DataFileWriter
from avro.io import DatumReader, DatumWriter

schema = avro.schema.Parse(open("user.avsc", "rb").read())

writer = DataFileWriter(open("users.avro", "wb"), DatumWriter(), schema)
for i in range(0, 10000):
    if i % 3 == 0:
        country = 'cn'
        hobby = None
    else:
        country = 'us'
        hobby = 'drink'
    if i % 4 == 0:
        name = 'lisi'
    elif i % 3 == 0:
        name = 'zhangsan'
    else:
        name = 'wangwu'
    age = random.randint(1, 99)

    writer.append({
        "id": i,
        "name": name,
        "age": age,
        "country": country,
        "hobby": hobby
    })

writer.close()
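
# A minimal read-back sketch (added here, not part of the original snippet),
# using the DataFileReader/DatumReader already imported above:
reader = DataFileReader(open("users.avro", "rb"), DatumReader())
for user in reader:
    print(user)
reader.close()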
Example 2
import avro.schema
from avro.datafile import DataFileReader, DataFileWriter
from avro.io import DatumReader, DatumWriter

d = {
    'PackageID' : 1539,
    'PersonID' : 33,
    'Name' : """MEGA_GAMER_2222""",
    'Inventory': {str(i): i for i in range(100)},
    'CurrentLocation': """
		Pentos is a large port city, more populous than Astapor on Slaver Bay, 
		and may be one of the most populous of the Free Cities. 
		It lies on the bay of Pentos off the narrow sea, with the Flatlands 
		plains and Velvet Hills to the east.
		The city has many square brick towers, controlled by the spice traders. 
		Most of the roofing is done in tiles. There is a large red temple in 
		Pentos, along with the manse of Illyrio Mopatis and the Sunrise Gate 
		allows the traveler to exit the city to the east, 
		in the direction of the Rhoyne.
		"""
}

schema = avro.schema.parse(open("ApacheAvro/person.avsc", "rb").read())
writer = DataFileWriter(open("ApacheAvro/people.avro", "wb"), DatumWriter(), schema)
writer.append(d)
writer.close()
with open('ApacheAvro/people.avro', 'rb') as file:
    src = file.read()
writer = DataFileWriter(open("ApacheAvro/people.avro", "wb"), DatumWriter(), schema)
Example 3
        # decode UTF-8 back to Unicode, cell by cell:
        yield [unicode(cell, 'utf-8') for cell in row]


def utf_8_encoder(unicode_csv_data):
    for line in unicode_csv_data:
        yield line.encode('utf-8')


schema = avro.schema.parse(open("data/song.avsc").read())

with codecs.open('data/subset_unique_tracks.txt', 'r',
                 encoding='latin_1') as csvfile:
    reader = unicode_csv_reader(csvfile, delimiter='|')
    writer = DataFileWriter(open("data/songs.avro", "w"),
                            DatumWriter(),
                            schema,
                            codec='deflate')
    for count, row in enumerate(reader):
        print count
        try:
            writer.append({
                "id1": row[0],
                "id2": row[1],
                "artist": row[2],
                "song": row[3]
            })
        except IndexError:
            print "Bad record, skip."
    writer.close()

#Uncomment to read and print the data from the Avro file
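# (A plausible sketch of that read-back, kept commented out as the note above
# suggests; it assumes the usual DataFileReader/DatumReader imports and is not
# taken verbatim from the original source.)
#reader = DataFileReader(open("data/songs.avro", "rb"), DatumReader())
#for song in reader:
#    print(song)
#reader.close()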
Example 4
 def _produce_test_input(self):
     schema = avro.schema.parse("""
     {
       "type":"record",
       "name":"TrackEntity2",
       "namespace":"com.spotify.entity.schema",
       "doc":"Track entity merged from various sources",
       "fields":[
         {
           "name":"map_record",
           "type":{
             "type":"map",
             "values":{
               "type":"record",
               "name":"MapNestedRecordObj",
               "doc":"Nested Record in a map doc",
               "fields":[
                 {
                   "name":"element1",
                   "type":"string",
                   "doc":"element 1 doc"
                 },
                 {
                   "name":"element2",
                   "type":[
                     "null",
                     "string"
                   ],
                   "doc":"element 2 doc"
                 }
               ]
             }
           },
           "doc":"doc for map"
         },
         {
           "name":"additional",
           "type":{
             "type":"map",
             "values":"string"
           },
           "doc":"doc for second map record"
         },
         {
           "name":"track_gid",
           "type":"string",
           "doc":"Track GID in hexadecimal string"
         },
         {
           "name":"track_uri",
           "type":"string",
           "doc":"Track URI in base62 string"
         },
         {
           "name":"Suit",
           "type":{
             "type":"enum",
             "name":"Suit",
             "doc":"enum documentation broz",
             "symbols":[
               "SPADES",
               "HEARTS",
               "DIAMONDS",
               "CLUBS"
             ]
           }
         },
         {
           "name":"FakeRecord",
           "type":{
             "type":"record",
             "name":"FakeRecord",
             "namespace":"com.spotify.data.types.coolType",
             "doc":"My Fake Record doc",
             "fields":[
               {
                 "name":"coolName",
                 "type":"string",
                 "doc":"Cool Name doc"
               }
             ]
           }
         },
         {
           "name":"master_metadata",
           "type":[
             "null",
             {
               "type":"record",
               "name":"MasterMetadata",
               "namespace":"com.spotify.data.types.metadata",
               "doc":"metadoc",
               "fields":[
                 {
                   "name":"track",
                   "type":[
                     "null",
                     {
                       "type":"record",
                       "name":"Track",
                       "doc":"Sqoop import of track",
                       "fields":[
                         {
                           "name":"id",
                           "type":[
                             "null",
                             "int"
                           ],
                           "doc":"id description field",
                           "default":null,
                           "columnName":"id",
                           "sqlType":"4"
                         },
                         {
                           "name":"name",
                           "type":[
                             "null",
                             "string"
                           ],
                           "doc":"name description field",
                           "default":null,
                           "columnName":"name",
                           "sqlType":"12"
                         }
                       ],
                       "tableName":"track"
                     }
                   ],
                   "default":null
                 }
               ]
             }
           ]
         },
         {
           "name":"children",
           "type":{
             "type":"array",
             "items":{
               "type":"record",
               "name":"Child",
               "doc":"array of children documentation",
               "fields":[
                 {
                   "name":"name",
                   "type":"string",
                   "doc":"my specific child\'s doc"
                 }
               ]
             }
           }
         }
       ]
     }""")
     self.addCleanup(os.remove, "tmp.avro")
     writer = DataFileWriter(open("tmp.avro", "wb"), DatumWriter(), schema)
     writer.append({
         u'track_gid': u'Cool guid',
         u'map_record': {
             u'Cool key': {
                 u'element1': u'element 1 data',
                 u'element2': u'element 2 data'
             }
         },
         u'additional': {
             u'key1': u'value1'
         },
         u'master_metadata': {
             u'track': {
                 u'id': 1,
                 u'name': u'Cool Track Name'
             }
         },
         u'track_uri': u'Totally a url here',
         u'FakeRecord': {
             u'coolName': u'Cool Fake Record Name'
         },
         u'Suit': u'DIAMONDS',
         u'children': [{
             u'name': u'Bob'
         }, {
             u'name': u'Joe'
         }]
     })
     writer.close()
     self.gcs_client.put("tmp.avro", self.gcs_dir_url + "/tmp.avro")
Example 5
model_id = 'recommender-model-' + '{:%Y-%m-%d-%H:%M:%S}'.format(
    datetime.datetime.now())

# ## Generate Avro with Schema

# In[30]:

## Generate avro directly

# Parse the schema file
schema = avro.schema.Parse(open("avro/RecommenderModel.avsc", "rb").read())

# Create a data file using DataFileWriter
dataFile = open(model_path + "recommender.avro", "wb")

writer = DataFileWriter(dataFile, DatumWriter(), schema)

# Write data using DatumWriter
writer.append({
    "modelId": model_id,
    "tensorFlowModel": model_file_binary,
    "productMap": productMapping,
    "customerMap": customerMapping
})

writer.close()

# In[31]:

reader = DataFileReader(open(model_path + "recommender.avro", "rb"),
                        DatumReader())
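
# Hypothetical continuation (not in the original notebook cell): iterate the
# record written above and close the reader when done.
for model_record in reader:
    print(model_record["modelId"])
reader.close()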
Example 6
    def file_write(self, fname, data):
        "Write documents in append mode to given file name"
        # perform input data validation
        good_data = []
        # write bad data records into output file
        bdir = os.path.dirname(fname)
        bdir = '%s/bad' % bdir if bdir else '/tmp/bad'
        if not os.path.exists(bdir):
            os.makedirs(bdir)
        bfname = '%s/%s_bad.txt' % (bdir, os.path.basename(fname))
        count = ecount = edocs = 0
        with open(bfname, 'a') as bstream:
            for rec in data:
                validator = RecordValidator()
                validator.run(self.schema_json, rec)
                if validator.errors:
                    bstream.write(json.dumps(rec) + '\n')
                    for err in validator.errors:
                        msg = 'SCHEMA ERROR '
                        for key, val in err.items():
                            msg += '%s: %s ' % (key.upper(), json.dumps(val))
                        bstream.write(msg + '\n')
                    bstream.write('-------------\n')
                    ecount += len(validator.errors)
                    edocs += 1
                else:
                    good_data.append(rec)
                count += 1
        if ecount:
            print("WARNING: received %s docs, found %s bad docs, %s errors, see %s"\
                    % (count, edocs, ecount, bfname))
        # use only good portion of the data
        data = good_data
        try:
            schema = self.schema
            wmaids = []
            if not hasattr(data, '__iter__') or isinstance(data, dict):
                data = [data]

            if os.path.exists(fname):
                schema = None  # we'll append to existing file
            mode = 'a+' if fname.endswith('.avro') else 'a'
            if mode == 'a':
                print(
                    "We're unable yet to implement read-write mode with compressed avro files"
                )
                raise NotImplementedError
            rec = None  # keep doc in case of failure
            with DataFileWriter(open_file(fname, mode), DatumWriter(),
                                schema) as writer:
                for rec in data:
                    writer.append(rec)
                    writer.flush()
                    wmaid = rec.get('wmaid', wmaHash(rec))
                    wmaids.append(wmaid)
            return wmaids
        except Exception as exc:
            err = traceback.format_exc(limit=1).splitlines()[-1]
            line = ' '.join(str(exc).replace('\n', '').split())
            msg = 'Failure in %s storage, error=%s, exception=%s' \
                    % (self.stype, err, line)
            msg += ' Failed document: '
            msg += json.dumps(rec)
            raise WriteError(msg)
Example 7
import avro.schema
import json
from avro.datafile import DataFileReader, DataFileWriter
from avro.io import DatumReader, DatumWriter

schema = avro.schema.parse(open("avro.schema", "rb").read())

reader = DataFileReader(open("cdn.avro", "rb"), DatumReader(schema))

writer = DataFileWriter(open("cdn-new.avro", "wb"), DatumWriter(), schema)

line = 0
linescopied = 0
for row in reader:
    if (line > 0):
        writer.append(row)
        linescopied += 1
    else:
        print(row)
    line += 1

print(str(linescopied) + " lines copied")

writer.close()
reader.close()
Example 8
        assert (type(j["type"]) is str)
        assert ("timestamp" in j)
        assert (type(j["timestamp"]) is str)

    json_wt = timeit.timeit(read_ujson_manualschema, number=n)
    print("python,ujson-manualschema,decoding,{},{:.2f}".format(n, json_wt))

print("JSON output size: {:,} bytes".format(os.path.getsize(json_filename)))

with open(avro_filename, "wb") as avro_f:

    def write_avro():
        avro_writer.append(event)

    avro_schema = avro.schema.Parse(open("event.avsc").read())
    avro_writer = DataFileWriter(avro_f, DatumWriter(), avro_schema)
    avro_t = timeit.timeit(write_avro, number=n)

    print("python,avro,encoding,{},{:.2f}".format(n, avro_t))
    avro_writer.close()

with open(avro_filename, "rb") as avro_f:

    def read_avro():
        u = reader.__iter__().__next__()

    reader = DataFileReader(avro_f, DatumReader())
    avro_wt = timeit.timeit(read_avro, number=n)
    print("python,avro,decoding,{},{:.2f}".format(n, avro_wt))
    reader.close()
Example 9
def avro_writer(path):
    with client.write(path, overwrite=True) as writer:
        with DataFileWriter(writer, DatumWriter(), schema,
                            codec='snappy') as avro_writer:
            for record in records:
                avro_writer.append(record)
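
# Hypothetical setup for the snippet above; none of these names come from the
# original source. `client` is assumed to be an HDFS client whose write()
# returns a file-like context manager (e.g. hdfs.InsecureClient), and the
# schema file, records and paths are placeholders. The 'snappy' codec also
# requires python-snappy to be installed.
import avro.schema
from avro.datafile import DataFileWriter
from avro.io import DatumWriter
from hdfs import InsecureClient

client = InsecureClient("http://namenode:9870", user="hdfs")   # assumed WebHDFS address
schema = avro.schema.parse(open("record.avsc", "rb").read())   # assumed schema file
records = [{"id": 1}, {"id": 2}]                               # placeholder records

avro_writer("/data/records.avro")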
Example 10
def __init__(self, schemaFile, avroFile):
    self.schema = avro.schema.Parse(open(schemaFile, "rb").read())
    self.writer = DataFileWriter(open(avroFile, "wb"), DatumWriter(), self.schema)


# Helper used below to list the immediate subdirectories of a directory
def get_immediate_subdirectories(a_dir):
    return [name for name in os.listdir(a_dir)
            if os.path.isdir(os.path.join(a_dir, name))]

# Constants
AVRO_SCHEMA_FILE = "../../../avroschema/WikiArticleLinked.avsc"

# Enter correct dir here
home_dir = ".../enwiki-latest-wikiextractor/"
subdirs = get_immediate_subdirectories(home_dir)
for sub_dir in subdirs:
    dir_path = home_dir + sub_dir

    # Avro file
    AVRO_FILE = "../../../avro/wiki-files/" + "-".join(dir_path.split("/")[-2:]) + ".avro"
    wiki_schema = avro.schema.Parse(open(AVRO_SCHEMA_FILE, "rb").read().decode("utf-8"))
    writer = DataFileWriter(open(AVRO_FILE, "wb"), DatumWriter(), wiki_schema)
    print("writing file " + AVRO_FILE + " ....")

    sub_sub_dirs = get_immediate_subdirectories(home_dir + sub_dir)

    weird_case = 0
    for sub_sub_dir in sub_sub_dirs:    
        # From each file, write into the avro schema
        files = os.listdir(dir_path + "/" + sub_sub_dir)
        #pdb.set_trace()
        for f in files:
            file_path = dir_path + "/" + sub_sub_dir + "/" + f
            with io.open(file_path) as f:
                for l in f.readlines():
                    # Get article + links
                    wiki_article = json.loads(l)
Example 12
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
#
import avro.schema
from avro.datafile import DataFileReader, DataFileWriter
from avro.io import DatumReader, DatumWriter

schema = avro.schema.parse(open("user.avsc").read())

writer = DataFileWriter(open("/tmp/users.avro", "w"), DatumWriter(), schema)
writer.append({"name": "Alyssa", "favorite_number": 256, "WTF": 2})
writer.append({"name": "Ben", "favorite_number": 7, "favorite_color": "red"})
writer.close()

reader = DataFileReader(open("/tmp/users.avro", "r"), DatumReader())
for user in reader:
    print user
reader.close()
Example 13
filtername = {}
airmass = {}
exptime = {}

# astrometric variables
NAXIS = np.zeros((2, 2))
CD = np.zeros((2, 2, 2))
nPV1 = 2
nPV2 = 11
PV = np.zeros((2, nPV1, nPV2))
CRVAL = np.zeros((2, 2))
CRPIX = np.zeros((2, 2))

schema = avro.schema.parse(open("hits.avsc", "rb").read())
writer = DataFileWriter(open("hits-demo.avro", "wb"), DatumWriter(), schema)

for field in fields:
    print(field)

    ccds = os.listdir("%s/%s" % (SHAREDDIR, field))

    for ccd in ccds:

        print(ccd)

        candidatesdir = "%s/%s/%s/CANDIDATES" % (SHAREDDIR, field, ccd)
        calibrationsdir = "%s/%s/%s/CALIBRATIONS" % (SHAREDDIR, field, ccd)

        if os.path.exists(candidatesdir):
Example 14
def main(f, ds):

    spark = SparkSession.builder.appName("lr_sample_train").getOrCreate()
    schema = avro.schema.Parse(open("lrAvro.avsc").read())
    writer = DataFileWriter(open("sample/%s.avro" % f, "wb"), DatumWriter(),
                            schema)

    df_sale = spark.sql(
        "select item_id, sale from songwt.item_sale where ds =%s" % ds)
    sale_dict = dict()
    for row in df_sale.collect():
        sale_dict[row.item_id] = row.sale

    sql = """
    SELECT devoruid, collect_set(categoryid) as prefer_cid from 
    (SELECT devoruid, categoryid, row_number() OVER (PARTITION BY devoruid ORDER BY score DESC) as num 
    from features.user_categoryid_preference WHERE ds=%s and score > 1) t where num <= 5 GROUP BY devoruid
    """ % ds

    df_prefer = spark.sql(sql)
    prefer_dict = dict()
    for row in df_prefer.collect():
        prefer_dict[row.devoruid] = row.prefer_cid

    path = "/user/yarn/swt/sample/%s" % f

    sample = spark.sparkContext.textFile(path)

    lines = sample.collect()
    sellcity = craftsman()
    for line in lines:
        ln = line.replace("(", "")
        ln2 = ln.replace(")", "")
        lnn = ln2.split(",")
        features = dict()
        _l = list()
        prefer_tag = []

        if lnn == "null" or lnn is None:
            continue

        for idx, item in enumerate(lnn):
            ft = dict()
            if idx == 0:
                prefer_cid = prefer_dict.get(item, list())

            elif idx == 1:
                ft2 = dict()
                sale = sale_dict.get(int(item), 0)
                level = getsale(sale)
                ft2["name"] = "sale"
                ft2["term"] = str(level)
                ft2["value"] = 1.0
                _l.append(ft2)

            elif idx == 2:
                features["label"] = int(item)
            else:
                values = item.split("=")
                name = values[0]
                if name in [
                        "collect2", "add2", "show2", "click2", "city_city",
                        "province"
                ]:
                    continue
                term = values[1].split(":")[0]
                value = values[1].split(":")[1]
                if name.find("tag_ptag") != -1:
                    ptag = term.split("_")[1]
                    prefer_tag.append(ptag)
                    continue
                if name == "collect1":
                    new_term = getcollect(float(term))
                elif name == "add1":
                    new_term = getaddcart(float(term))
                elif name == "cid":
                    cid = int(term)
                    new_term = term
                elif name == "tag":
                    tag = term
                    new_term = term
                elif name.find("city") != -1:

                    city = term.split("_")[1]
                    new_city = sellcity.get(city, "")
                    new_term = term.split("_")[0] + "_" + new_city

                elif name == "show1":
                    exposure = float(term)
                    continue
                elif name == "click1":
                    click = float(term)
                    continue
                else:
                    new_term = term
                ft["name"] = name
                ft["term"] = str(new_term)
                ft["value"] = float(value)
                _l.append(ft)
        ctr = (click + 5) / (exposure + 50)
        ctr_level = getctr(ctr)
        ctr_dict = dict()
        ctr_dict["name"] = "ctr"
        ctr_dict["value"] = 1.0
        ctr_dict["term"] = str(ctr_level)
        _l.append(ctr_dict)

        if cid in prefer_cid:
            ifcid = "1"
        else:
            ifcid = "0"

        if tag in prefer_tag:
            iftag = "1"
        else:
            iftag = "0"

        prefer_cid_dict = dict()
        prefer_cid_dict["name"] = "ifcid"
        prefer_cid_dict["term"] = ifcid
        prefer_cid_dict["value"] = 1.0
        _l.append(prefer_cid_dict)

        prefer_tag_dict = dict()
        prefer_tag_dict["name"] = "iftag"
        prefer_tag_dict["term"] = iftag
        prefer_tag_dict["value"] = 1.0
        _l.append(prefer_tag_dict)

        ctr_sale_dict = dict()
        ctr_sale_dict["name"] = "ctr_sale"
        ctr_sale_dict["term"] = str(ctr_level) + "_" + str(level)
        ctr_sale_dict["value"] = 1.0
        _l.append(ctr_sale_dict)

        features["features"] = _l
        writer.append(features)
    writer.close()