Example no. 1
    def calculate_centroids(self):
        if os.path.exists(self.centroids_file):
            os.remove(self.centroids_file)

        f = open(self.corpus_file, 'r')
        objects = ijson.items(f, 'articles.item')
        i = 0
        idmap = {}
        cent_array = []
        for article in objects:
            abstract_text = article["abstractText"]
            abstract_id = article["pmid"]
            text = article["title"] + " " + abstract_text

            centroid = get_centroid_idf(text, self.emb, self.idf, self.stopwords, self.dim)

            cent_array.append(np.array(centroid, dtype=np.float32))

            idmap[i] = abstract_id
            i += 1
        final_cent_array = np.array(cent_array, dtype=np.float32).reshape((i, self.dim))
        print final_cent_array.shape
        np.save(self.centroids_file, final_cent_array)


        fout = open(self.idmap_file, 'wb')
        pickle.dump(idmap, fout)
        fout.close()
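A side note on the loop above: the hand-rolled i counter can be folded into enumerate, and the final array length taken from cent_array itself. A condensed sketch of the same loop (attribute and helper names taken from the example above, not verified beyond it):

        idmap = {}
        cent_array = []
        for i, article in enumerate(objects):
            text = article["title"] + " " + article["abstractText"]
            centroid = get_centroid_idf(text, self.emb, self.idf,
                                        self.stopwords, self.dim)
            cent_array.append(np.array(centroid, dtype=np.float32))
            idmap[i] = article["pmid"]
        final_cent_array = np.array(cent_array, dtype=np.float32).reshape(
            (len(cent_array), self.dim))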
Example no. 2
    def calculate_centroids(self):
        if os.path.exists(self.centroids_file):
            os.remove(self.centroids_file)

        f = open(self.corpus_file, 'r')
        objects = ijson.items(f, 'articles.item')
        i = 0
        idmap = {}
        cent_array = []
        for article in objects:
            abstract_text = article["abstractText"]
            abstract_id = article["pmid"]
            text = article["title"] + " " + abstract_text

            centroid = get_centroid_idf(text, self.emb, self.idf,
                                        self.stopwords, self.dim)

            cent_array.append(np.array(centroid, dtype=np.float32))

            idmap[i] = abstract_id
            i += 1
        final_cent_array = np.array(cent_array, dtype=np.float32).reshape(
            (i, self.dim))
        print final_cent_array.shape
        np.save(self.centroids_file, final_cent_array)

        fout = open(self.idmap_file, 'wb')
        pickle.dump(idmap, fout)
        fout.close()
Example no. 3
def _lean_load_json_file(filepath):
    """Helper function to load json contents from a file using ijson."""
    LOG.debug("About to load %s." % filepath)

    fd = open(filepath, 'rb')

    gzipper = gzip.GzipFile(fileobj=fd)
    builds = ijson.items(gzipper, 'builds.item')
    ret = {'builds': []}
    try:
        # We are going to store only the information we need from builds-.js
        # and ignore the rest.
        ret['builds'] = [{
            'properties': {
                key: value
                for (key, value) in b["properties"].iteritems()
                if key in ('buildername', 'request_ids', 'revision',
                           'packageUrl', 'testPackagesUrl', 'testsUrl')
            },
            'request_ids': b['request_ids']
        } for b in builds]

    except IOError, e:
        LOG.warning(str(e))
        raise

    fd.close()
    gzipper.close()

    return ret
Example no. 4
def _lean_load_json_file(filepath):
    """Helper function to load json contents from a file using ijson."""
    LOG.debug("About to load %s." % filepath)

    fd = open(filepath, 'rb')

    gzipper = gzip.GzipFile(fileobj=fd)
    builds = ijson.items(gzipper, 'builds.item')
    ret = {'builds': []}
    # We are going to store only the information we need from builds-.js
    # and drop the rest.
    ret['builds'] = [{
        "properties": {
            key: value
            for (key, value) in b["properties"].iteritems()
            if key in ('buildername', 'request_ids', 'revision', 'packageUrl',
                       'testPackagesUrl', 'testsUrl')
        },
        "request_ids": b["request_ids"]
    } for b in builds]

    fd.close()
    gzipper.close()

    return ret
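The b["properties"].iteritems() call marks this as Python 2 code; iteritems() was removed in Python 3 in favor of items(). A sketch of just that filtering comprehension under Python 3:

properties = {
    key: value
    for (key, value) in b["properties"].items()
    if key in ('buildername', 'request_ids', 'revision',
               'packageUrl', 'testPackagesUrl', 'testsUrl')
}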
Example no. 5
    def load_data(self):
        """
        method to lazy load music.json file in memory
        """
        f = open("data/music-sample.json", 'rb')
        data = ijson.items(f, "item")

        return data
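A caveat worth noting here: ijson.items returns a lazy generator tied to the open file handle, so load_data never closes the file and the handle must stay readable while the caller iterates. A minimal alternative sketch (same path, hypothetical name load_data_eager) that materializes the items before the handle closes, trading laziness for safety:

    def load_data_eager(self):
        """
        Read all items from music-sample.json up front so the file
        can be closed immediately. A sketch, not the original API.
        """
        with open("data/music-sample.json", 'rb') as f:
            return list(ijson.items(f, "item"))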
Example no. 6
def parser(files,accuracy):
    datapoints = []
    x = 0

    for filename in files:
        with open(filename, 'rb') as f:
            print "Parsing %s" % (filename)
            if args.fast:  # args is the module-level argparse namespace (defined elsewhere)
                jsondata = json.load(f)
                data = jsondata["locations"]

            else:
                objects = ijson.items(f, 'locations.item')
                data = (o for o in objects)

            for entry in data:
                try:
                    if entry["accuracy"] > accuracy:
                        lat = str(float(entry["latitudeE7"]) / 1e7)
                        long = str(float(entry["longitudeE7"]) / 1e7)
                        #lat = str(decimal.Decimal(entry["latitudeE7"]) * decimal.Decimal(0.0000001))
                        #long = str(decimal.Decimal(entry["longitudeE7"]) * decimal.Decimal(0.0000001))
                        location = (lat, long, 1)
                        if len(datapoints) > 2:  #make sure the list is long enough to bisect
                            lookuplocation = (bisect.bisect_left(datapoints, location)) #bisect based on current location from google
                            #print "length: %s, location: %s" % (len(datapoints), lookuplocation)

                            if lookuplocation + 1 < len(datapoints): #check to see if we are at the end of the data set, if so append
                                # print "listlat: %s, lat: %s" %(datapoints[lookuplocation+1][0], lat)
                                if datapoints[lookuplocation + 1][0] == lat: # lat matches an existing point
                                    if datapoints[lookuplocation + 1][1] == long: # long matches too: a duplicate, so bump its count
                                        datapoints[lookuplocation+1] = (lat, long, datapoints[lookuplocation+1][2]+1)
                                        # print "duplicate found: %s, %s" % (datapoints[lookuplocation+1], datapoints[lookuplocation+1][2])
                                    else:
                                        # print "long didnt match"
                                        datapoints.insert(lookuplocation, location)
                                else:
                                    # print "lat didnt match"
                                    datapoints.insert(lookuplocation, location)
                            else:  # if the data goes at the end just append it instead of inserting it
                                #print "length: %s, location: %s" % (len(datapoints), lookuplocation)
                                datapoints.append(location)
                        else:
                            datapoints.append(location)
                except KeyError:
                    #print "entry contained no accuracy information, excluding"
                    continue
    count, frequent = 0,[]
    for data in datapoints:
        if data[2] > count:
            frequent = data
            count = frequent[2]

    return datapoints, frequent
Example no. 7
    def handle(self, *args, **options):
        self.groups = list(Groupe.objects.values_list('nom', flat=True))

        for source in args:
            with open(source, 'r') as f:
                for item in ijson.items(f, 'item'):
                    if item['groupe'] in self.groups:
                        continue

                    groupe, created = Groupe.objects.get_or_create(
                        nom=item['groupe'])

                    self.groups.append(item['groupe'])
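Note that get_or_create already handles the already-exists case on its own; the in-memory self.groups list only saves a database query per item. A sketch of the same handler without the cache (one round-trip per item, under the same model names):

    def handle(self, *args, **options):
        for source in args:
            with open(source, 'r') as f:
                for item in ijson.items(f, 'item'):
                    Groupe.objects.get_or_create(nom=item['groupe'])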
Example no. 8
File: create.py Project: tbdn/dbp
    def run(self):
        with open(self.file, "r") as f:
            packets = ijson.items(f, 'item')

            batch = []
            for packet in packets:
                batch.append(packet)

                if len(batch) == self.batch_size:
                    self.queue.put(batch)
                    batch = []

            if len(batch) > 0:
                self.queue.put(batch)
        self.finished = True
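For completeness, a hedged sketch of a consumer that could sit on the other side of self.queue; the batch/finished handshake mirrors the run() method above, but this pairing is an assumption, and process_packet is a hypothetical handler:

import queue

def consume(worker):
    # Drain batches until the producer reports it is finished and
    # the queue is empty.
    while not (worker.finished and worker.queue.empty()):
        try:
            batch = worker.queue.get(timeout=0.1)
        except queue.Empty:
            continue
        for packet in batch:
            process_packet(packet)  # hypothetical per-packet handler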
Example no. 9
    def initialize(self):
        f = open(self.ret_file, 'r')
        data_q = json.load(f)
        abstracts_needed = set()
        for i in range(len(data_q["questions"])):
            abstracts_needed = abstracts_needed | set(data_q["questions"][i]["retrieved"])
        f.close()

        print "Collecting Abstracts.."
        f = open(self.corpus_file, 'r')
        corpus = ijson.items(f, 'articles.item')
        for article in corpus:
            pmid = article["pmid"]
            if pmid in abstracts_needed:
                self.corpus_index[pmid] = article["title"] + ' ' + article["abstractText"]
                abstracts_needed.remove(pmid)
                if not abstracts_needed:
                    break
        f.close()


        print len(self.corpus_index)
        q_array_q = []
        q_array_d = []
        q_array_max = []
        print "Reranking.."
        n_questions = len(data_q["questions"])
        for i in range(n_questions):
            #print i
            progress(i+1, n_questions, 'questions')
            q_id = data_q["questions"][i]["id"]
            q_body = data_q["questions"][i]["body"]
            q_retrieved = data_q["questions"][i]["retrieved"]

            retr_array_q, retr_array_d, retr_array_max = self.rerank(q_body, q_retrieved)

            q_array_q.append(Question(q_body, q_id, [x[0] for x in retr_array_q], [x[1] for x in retr_array_q]))
            q_array_d.append(Question(q_body, q_id, [x[0] for x in retr_array_d], [x[1] for x in retr_array_d]))
            q_array_max.append(Question(q_body, q_id, [x[0] for x in retr_array_max], [x[1] for x in retr_array_max]))

        with open('.'.join(self.ret_file.split('.')[:-1]) + '_RWMD_Q.json', 'w+') as outfile:
            outfile.write(json.dumps({"questions":[ob.__dict__ for ob in q_array_q]}, indent=2))

        with open('.'.join(self.ret_file.split('.')[:-1]) + '_RWMD_D.json', 'w+') as outfile:
            outfile.write(json.dumps({"questions":[ob.__dict__ for ob in q_array_d]}, indent=2))

        with open('.'.join(self.ret_file.split('.')[:-1]) + '_RWMD_MAX.json', 'w+') as outfile:
            outfile.write(json.dumps({"questions":[ob.__dict__ for ob in q_array_max]}, indent=2))
Example no. 10
def load_json_from_file(file_path):
    """Loads JSON data from the file.

    Opens the file at ``file_path`` and extracts the data it contains.

    :param str file_path: Path of the JSON file to read.

    :returns: JSON data.
    :rtype: `dict`
    """
    with open(file_path, 'rb') as f:  # 'f' avoids shadowing the Python 2 builtin 'file'
        items_generator = ijson.items(f, '')
        list_items = [item for item in items_generator]
        json_dict = list_items[0]

        return json_dict
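Since an empty prefix makes ijson.items yield the whole top-level value in one piece (so there is no streaming benefit here), the list-then-index step can be collapsed. A minimal equivalent sketch:

def load_json_from_file(file_path):
    """Load the single top-level JSON value from a file."""
    with open(file_path, 'rb') as f:
        # With prefix '', ijson yields exactly one item: the root value.
        return next(ijson.items(f, ''))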
Example no. 11
def extract_jD(inputFile):
    print('Loading data file...')

    f = open(inputFile , 'rb')

    valid = []
    # This part appends surrounding evidences
    done = 0
    ignored = 0

    allJavaDocSeqs = []
    for program in ijson.items(f, 'programs.item'):

        javadoc = program['javaDoc']

        if (javadoc is None) or (len(javadoc) == 0):
            ignored += 1
        else:

            javadoc_list = javadoc.strip().split()
            # replace every non-alphabetic character with an underscore
            javadoc_list = [re.sub("[^a-zA-Z]", '_', w) for w in javadoc_list]

            # break the terms apart on the underscores
            tmp_list = []
            for t in javadoc_list:
                s = re.split("_+", t)
                tmp_list.extend(s)

            result_list = []
            for x in tmp_list:
                if len(x) > 1:
                    x = LEMMATIZER.lemmatize(x)
                    result_list.extend(camelCaseSplit(x))

            done += 1
            allJavaDocSeqs.append(result_list)
        print("Done with this many programs :: " + str(done + ignored), end='\r' )

    f.close()

    print("Number of programs with javadoc :: " + str(done) )
    print("Number of programs without javadoc :: " + str(ignored))
    
    return allJavaDocSeqs
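camelCaseSplit is not shown in this snippet; a plausible sketch of such a helper (an assumption about its behavior, splitting tokens on case boundaries, not the project's actual code):

import re

def camelCaseSplit(token):
    # Split "camelCase" or "HTTPServer"-style tokens into their parts;
    # a guess at the helper's behavior, not the original implementation.
    return re.findall(r'[A-Z]+(?![a-z])|[A-Z][a-z]+|[a-z]+|\d+', token)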
Example no. 12
def validate_catalog_datasets(agency_id, schema='DATASET_1.0'):
    agency = audit = tasks = resp = None
    with transaction.atomic():
        try:
            # Get agency
            agency = Agency.objects.get(id=agency_id)

        except Agency.DoesNotExist as e:
            logger.exception(e)
            raise e

    # Get schema info (schema path, dataset_prefix)
    schema_info = JSON_SCHEMAS.get(schema, None)

    with transaction.atomic():
        audit = Audit.objects.create(agency_id=agency_id,
                                     audit_type=Audit.DATA_CATALOG_VALIDATION)

    try:
        with closing(open_streaming_response('GET',
                                             agency.data_json_url)) as resp:
            # Use the schema dataset_prefix to get an iterator for the items to be validated.
            objects = ijson.items(resp.raw,
                                  schema_info.get('dataset_prefix', ''))

            default_args = {
                'json_schema_name': schema,
                'source_url': agency.data_json_url
            }
            if audit:
                default_args.update({'audit_id': audit.id})

            # We're going to spin off async tasks to validate each object
            tasks = []
            for num, obj in enumerate(objects):
                args = default_args.copy()
                args.update({'json_object': obj, 'object_position': num})
                task = validate_json_object.apply_async(
                    args=(args, ), countdown=(num % COUNTDOWN_MODULO))
                tasks.append(task)

    except Exception as e:
        logger.exception(e)

    return tasks
Example no. 13
    def handle(self, *args, **options):
        self.insert = options['insert']

        if not self.insert and Vote.objects.count() == 0:
            print '--insert not specified and no vote in db, forcing --insert'
            self.insert = True

        self.deputes = {}
        for depute in Depute.objects.values_list('nom', 'prenom', 'pk'):
            self.deputes[depute[0]+depute[1]] = depute[2]

        self.scrutins = {}
        for scrutin in Scrutin.objects.values_list('uri', 'pk'):
            self.scrutins[scrutin[0]] = scrutin[1]

        self.groupes = {}
        for groupe in Groupe.objects.values_list('pk', 'nom'):
            self.groupes[groupe[1]] = groupe[0]

        print 'loaded cache'

        votes = []

        for source in args:
            with open(source, 'r') as f:
                print 'opening json'
                items = ijson.items(f, 'item')
                print 'loaded json'

                for item in items:
                    try:
                        votes.append(self.get_vote(item))
                    except KeyError:
                        pass

                    if not self.insert:
                        continue

                    if len(votes) > 20000:
                        Vote.objects.bulk_create(votes)
                        votes = []
                        print 'flushing'

        if self.insert:
            Vote.objects.bulk_create(votes)
Example no. 14
def _lean_load_json_file(filepath):
    """Helper function to load json contents from a file using ijson."""
    LOG.debug("About to load %s." % filepath)

    fd = open(filepath, 'rb')

    gzipper = gzip.GzipFile(fileobj=fd)
    builds = ijson.items(gzipper, 'builds.item')
    ret = {'builds': []}
    # We are going to store only the information we need from builds-.js
    # and drop the rest.
    ret['builds'] = [
        {"properties": {key: value for (key, value) in b["properties"].iteritems() if key in
                        ('buildername', 'request_ids', 'revision', 'packageUrl',
                            'testPackagesUrl', 'testsUrl')},
         "request_ids": b["request_ids"]}
        for b in builds]

    fd.close()
    gzipper.close()

    return ret
Example no. 15
def _lean_load_json_file(filepath):
    """Helper function to load json contents from a file using ijson."""
    LOG.debug("About to load %s." % filepath)

    fd = open(filepath, 'rb')

    gzipper = gzip.GzipFile(fileobj=fd)
    builds = ijson.items(gzipper, 'builds.item')
    ret = {'builds': []}
    try:
        # We are going to store only the information we need from builds-.js
        # and ignore the rest.
        ret['builds'] = [{
            'properties': {
                key: value for (key, value) in b["properties"].iteritems()
                if key in ('buildername', 'request_ids', 'revision', 'packageUrl',
                           'testPackagesUrl', 'testsUrl')
            },
            'request_ids': b['request_ids']
        } for b in builds]

    except IOError, e:
        LOG.warning(str(e))
        raise

    fd.close()
    gzipper.close()

    return ret
Example no. 16
def _json_parse(stdout):
    try:
        results = list(ijson.items(stdout, ''))[0]
        return results
    except ijson.backends.python.UnexpectedSymbol:
        return []
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('location_file',
                        help='json extraction of your location')
    parser.add_argument(
        'home_coord',
        help='coordinates of home locations (comma-separated list of lat:lon pairs)')
    parser.add_argument(
        'work_coord',
        help='coordinates of work locations (comma-separated list of lat:lon pairs)')
    parser.add_argument(
        '--start_date',
        help='only parse data since this date. Format is YYYY/MM/DD')
    parser.add_argument('--tolerance',
                        default=0.5,
                        type=float,
                        help='tolerance used when detecting a location')

    args = parser.parse_args()
    start_date = args.start_date
    if start_date:
        try:
            start_date = datetime.strptime(start_date, '%Y/%m/%d')
        except ValueError:
            print('start date "{}" doesn\'t match the expected format: '
                  'YYYY/MM/DD'.format(start_date))
            start_date = None  # fall back to no date filtering instead of crashing below

    start_date_ms = int(start_date.timestamp() * 1000) if start_date else 0

    # Lat, Lon
    home = [[float(c) for c in l.split(':')]
            for l in args.home_coord.split(',')]
    work = [[float(c) for c in l.split(':')]
            for l in args.work_coord.split(',')]

    distance_helper = DistanceHelper(home, work, args.tolerance)

    print('Distance between work and home: {}'.format(
        distance_helper.distance(home[0], work[0])))

    days_at_work = {}
    days_at_home = {}
    with open(args.location_file, 'rb') as loc:
        records = ijson.items(loc, 'locations.item')
        n = 0
        for rec in records:
            n += 1
            if n % 10000 == 0:
                print('At {}'.format(n))

            timestamp = int(rec['timestampMs'])
            if timestamp < start_date_ms:
                break  # assumes records are ordered newest-first

            location = (rec['latitudeE7'] / 10000000.,
                        rec['longitudeE7'] / 10000000.)
            timestamp = timestamp / 1000.
            d = datetime.fromtimestamp(timestamp)
            date_tuple = (d.year, d.month, d.day)

            if distance_helper.at_work(location):
                day = days_at_work.setdefault(date_tuple,
                                              DaySomewhere(MAX_DATE, MIN_DATE))
                if day.arrived.timestamp() > timestamp:
                    day.arrived = d
                if day.left.timestamp() < timestamp:
                    day.left = d
            elif distance_helper.at_home(location):
                day = days_at_home.setdefault(date_tuple,
                                              DaySomewhere(MAX_DATE, MIN_DATE))
                if day.arrived.timestamp() > timestamp and d.hour > 13:
                    day.arrived = d
                elif day.left.timestamp() < timestamp and d.hour < 13:
                    day.left = d

        print('Processed {} records'.format(n))
        print('worked for {} days'.format(len(days_at_work)))
        print('was home for {} days'.format(len(days_at_home)))

        # Print the work day length.
        worked_seconds = []
        for k in sorted(days_at_work.keys()):
            arrived = days_at_work[k].arrived
            left = days_at_work[k].left
            t = left - arrived
            worked_hours = int(t.total_seconds() / 3600)
            worked_minutes = int((t.total_seconds() % 3600) / 60)
            if worked_hours > 2:
                worked_seconds.append(t.total_seconds())

            print('{}: Worked {} hours and {} minutes, from {} to {}'.format(
                arrived.strftime('%a %b %d'), worked_hours, worked_minutes,
                arrived.strftime('%H:%M'), left.strftime('%H:%M')))

        average_seconds = sum(worked_seconds) / len(worked_seconds)
        print('On average: {} hours and {} minutes'.format(
            int(average_seconds / 3600), int((average_seconds % 3600) / 60)))

        # Print the commute length.
        print('\n\n\nCommute lengths:')
        commute_seconds = []
        for k in sorted(days_at_work.keys()):
            if k not in days_at_home:
                continue
            work = days_at_work[k]
            home = days_at_home[k]
            morning_commute = int(
                (work.arrived - home.left).total_seconds() / 60)
            evening_commute = int(
                (home.arrived - work.left).total_seconds() / 60)

            print('{}: Morning {}min, Evening {}min'.format(
                work.arrived.strftime('%a %b %d'), morning_commute,
                evening_commute))
Example no. 18
def _pkginfo_iterator():
    url = 'https://skimdb.npmjs.com/registry/_all_docs?include_docs=true'
    fh = urllib.urlopen(url)
    # temporary patch to use locally cached file
    # fh = open("../dataset/npm.cache/npm.json")
    return ijson.items(fh, 'rows.item')
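urllib.urlopen is the Python 2 spelling; under Python 3 the same streaming pattern goes through urllib.request, since ijson only needs a file-like object. A sketch:

import urllib.request
import ijson

def _pkginfo_iterator():
    url = 'https://skimdb.npmjs.com/registry/_all_docs?include_docs=true'
    # urlopen returns a file-like response that ijson can stream from.
    fh = urllib.request.urlopen(url)
    return ijson.items(fh, 'rows.item')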
Example no. 19
'''
Split apart test.json so I can run the damn thing on my computer.
Because apparently I don't have enough memory.
'''

import ijson.backends.yajl2 as ijson
import simplejson as json

jsonFile = open("data/test.json", "rb")

curSplit = 0
objBuffer = []

objs = ijson.items(jsonFile, "item")
for obj in objs:
    objBuffer.append(obj)
    if len(objBuffer) == 1000:
        with open("data/test." + str(curSplit) + ".json", "w") as outfile:
            json.dump(objBuffer, outfile, separators=(',', ':'))
        objBuffer = []
        curSplit += 1
        print("Wrote " + str(curSplit * 1000) + " records to disk")
# dump whatever is left
if objBuffer:
    with open("data/test." + str(curSplit) + ".json", "w") as outfile:
        json.dump(objBuffer, outfile, separators=(',', ':'))
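The buffer-and-flush pattern above can also be phrased with itertools.islice, which keeps the chunking logic in one place. A sketch of the whole loop under the same 1000-record chunk size (replacing, not following, the loop above):

import itertools

def chunks(iterable, size):
    # Yield successive lists of at most `size` items.
    it = iter(iterable)
    while True:
        block = list(itertools.islice(it, size))
        if not block:
            return
        yield block

for curSplit, objBuffer in enumerate(chunks(objs, 1000)):
    with open("data/test." + str(curSplit) + ".json", "w") as outfile:
        json.dump(objBuffer, outfile, separators=(',', ':'))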
Example no. 20
    def parse(self, data):
        import ijson.backends.yajl2 as ijson
        bugs = ijson.items(data, 'result.bugs.item')
        return (self.service.item(service=self.service, bug=bug) for bug in bugs)
Example no. 21
def _process_json(filename):
    if filename is None:
        return None

    cnt = 0
    selected = 0
    onlyArticles = True
    PO_NAME = 'wikidata.po'
    SAVE_INTERVAL = 1000
    PROCESS_NOF_ENTRIES = 2 * 1000 * 1000

    po_file = _create_empty_po_file()

    with open(filename, 'r') as json_data:
        value = ijson.items(json_data, 'item')

        for item in value:
            label = item.get('labels')
            if label is None:
                continue

            item_id = item['id']
            if onlyArticles is True:
                if item_id is None or item_id.startswith("Q") is False:
                    continue

            comment = u'Article {0}'.format(item_id)
            en_label = label.get('en')
            ca_label = label.get('ca')

            if en_label is None or ca_label is None:
                continue

            cnt = cnt + 1
            value = en_label['value']

            if _is_segment_valid(value) is False:
                continue

            exists = exists_in_tm(value)
            if exists > 0:
                selected = selected + 1
            else:
                continue

            entry = polib.POEntry(msgid=en_label['value'],
                                  msgstr=ca_label['value'],
                                  tcomment=comment + " (label)")

            _insert_entry_inpofile(po_file, entry)

            desc = item.get('descriptions')
            if desc is not None:
                en_desc = desc.get('en')
                ca_desc = desc.get('ca')

                if en_desc is not None and ca_desc is not None:
                    entry = polib.POEntry(msgid=en_desc['value'],
                                          msgstr=ca_desc['value'],
                                          tcomment=comment + " (description)")

                    _insert_entry_inpofile(po_file, entry)

            if cnt % SAVE_INTERVAL == 0:
                po_file.save(PO_NAME)

            if cnt > PROCESS_NOF_ENTRIES:
                break

    po_file.save(PO_NAME)
    print ("Total entries: " + str(cnt))
    print ("Selected: {0} (%{1})".format(str(selected), str(percentage(selected, cnt))))
Example no. 22
    def initialize(self):
        f = open(self.ret_file, 'r')
        data_q = json.load(f)
        abstracts_needed = set()
        for i in range(len(data_q["questions"])):
            abstracts_needed = abstracts_needed | set(
                data_q["questions"][i]["retrieved"])
        f.close()

        print "Collecting Abstracts.."
        f = open(self.corpus_file, 'r')
        corpus = ijson.items(f, 'articles.item')
        for article in corpus:
            pmid = article["pmid"]
            if pmid in abstracts_needed:
                self.corpus_index[pmid] = article["title"] + ' ' + article["abstractText"]
                abstracts_needed.remove(pmid)
                if not abstracts_needed:
                    break
        f.close()

        print len(self.corpus_index)
        q_array_q = []
        q_array_d = []
        q_array_max = []
        print "Reranking.."
        n_questions = len(data_q["questions"])
        for i in range(n_questions):
            #print i
            progress(i + 1, n_questions, 'questions')
            q_id = data_q["questions"][i]["id"]
            q_body = data_q["questions"][i]["body"]
            q_retrieved = data_q["questions"][i]["retrieved"]

            retr_array_q, retr_array_d, retr_array_max = self.rerank(
                q_body, q_retrieved)

            q_array_q.append(
                Question(q_body, q_id, [x[0] for x in retr_array_q],
                         [x[1] for x in retr_array_q]))
            q_array_d.append(
                Question(q_body, q_id, [x[0] for x in retr_array_d],
                         [x[1] for x in retr_array_d]))
            q_array_max.append(
                Question(q_body, q_id, [x[0] for x in retr_array_max],
                         [x[1] for x in retr_array_max]))

        with open('.'.join(self.ret_file.split('.')[:-1]) + '_RWMD_Q.json',
                  'w+') as outfile:
            outfile.write(
                json.dumps({"questions": [ob.__dict__ for ob in q_array_q]},
                           indent=2))

        with open('.'.join(self.ret_file.split('.')[:-1]) + '_RWMD_D.json',
                  'w+') as outfile:
            outfile.write(
                json.dumps({"questions": [ob.__dict__ for ob in q_array_d]},
                           indent=2))

        with open('.'.join(self.ret_file.split('.')[:-1]) + '_RWMD_MAX.json',
                  'w+') as outfile:
            outfile.write(
                json.dumps({"questions": [ob.__dict__ for ob in q_array_max]},
                           indent=2))
Example no. 23
def _process_json(filename):
    if filename is None:
        return None

    cnt = 0
    selected = 0
    onlyArticles = True
    PO_NAME = 'wikidata.po'
    SAVE_INTERVAL = 1000
    PROCESS_NOF_ENTRIES = 2 * 1000 * 1000

    po_file = _create_empty_po_file()

    with open(filename, 'r') as json_data:
        value = ijson.items(json_data, 'item')

        for item in value:
            label = item.get('labels')
            if label is None:
                continue

            item_id = item['id']
            if onlyArticles is True:
                if item_id is None or item_id.startswith("Q") is False:
                    continue

            comment = u'Article {0}'.format(item_id)
            en_label = label.get('en')
            ca_label = label.get('ca')

            if en_label is None or ca_label is None:
                continue

            cnt = cnt + 1
            value = en_label['value']

            if _is_segment_valid(value) is False:
                continue

            exists = exists_in_tm(value)
            if exists > 0:
                selected = selected + 1
            else:
                continue

            entry = polib.POEntry(msgid=en_label['value'],
                                  msgstr=ca_label['value'],
                                  tcomment=comment + " (label)")

            _insert_entry_inpofile(po_file, entry)

            desc = item.get('descriptions')
            if desc is not None:
                en_desc = desc.get('en')
                ca_desc = desc.get('ca')

                if en_desc is not None and ca_desc is not None:
                    entry = polib.POEntry(msgid=en_desc['value'],
                                          msgstr=ca_desc['value'],
                                          tcomment=comment + " (description)")

                    _insert_entry_inpofile(po_file, entry)

            if cnt % SAVE_INTERVAL == 0:
                po_file.save(PO_NAME)

            if cnt > PROCESS_NOF_ENTRIES:
                break

    po_file.save(PO_NAME)
    print("Total entries: " + str(cnt))
    print("Selected: {0} (%{1})".format(str(selected),
                                        str(percentage(selected, cnt))))
Example no. 24
for rep in range(times):
    try:
        counter = 0
        now = time.time()
        with open(sys.argv[3]) as fileobj:
            if parser.startswith('jsaone'):
                if parser == 'jsaone_cyt':
                    from jsaone_cyt import load
                elif parser == 'jsaone_py':
                    from jsaone_py import load
                elif parser == 'jsaone':
                    from jsaone import load

                for obj in load(fileobj):
                    counter += 1

            elif parser == 'json':
                from json import load
                for item in load(fileobj):
                    counter += 1

            elif parser == 'ijson':
                import ijson.backends.yajl2 as ijson
                for obj in ijson.items(fileobj, ''):
                    counter += 1

            print("Read %d objects in %f seconds with parser %s" %
                  (counter, time.time() - now, parser))
    except Exception:
        raise  # re-raise with the original traceback
Example no. 25
    with TestCase('ijson.yajl2_c', 'bytes', args.json_size, results):
        gen = io.BytesIO(jsondata.encode('utf-8'))
        parser = ijson_yajl2_c.items(gen, b'level1.level2.item')
        for n, item in enumerate(parser):
            assert (item['id'] == n)

    with TestCase('ijson.yajl2_cffi', 'bytes', args.json_size, results):
        gen = io.BytesIO(jsondata.encode('utf-8'))
        parser = ijson_yajl2_cffi.items(gen, b'level1.level2.item')
        for n, item in enumerate(parser):
            assert (item['id'] == n)

    with TestCase('ijson.yajl2', 'bytes', args.json_size, results):
        gen = io.BytesIO(jsondata.encode('utf-8'))
        parser = ijson_yajl2.items(gen, 'level1.level2.item')
        for n, item in enumerate(parser):
            assert (item['id'] == n)

    with TestCase('ijson.python', 'str', args.json_size, results):
        gen = io.StringIO(jsondata)
        parser = ijson_python.items(gen, 'level1.level2.item')
        for n, item in enumerate(parser):
            assert (item['id'] == n)

    print(
        tabulate(
            results,
            headers=['Facility', 'Type', 'Objects/sec'],
            stralign='right',
            # colalign=['left', 'center', 'right'],
        ))
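For reference, the backend aliases this benchmark assumes can be bound as below; these module paths are ijson's documented backends (the yajl2 variants require the yajl C library), and the preamble itself is an assumption, not shown in the snippet:

import io
import ijson.backends.python as ijson_python
import ijson.backends.yajl2 as ijson_yajl2
import ijson.backends.yajl2_cffi as ijson_yajl2_cffi
import ijson.backends.yajl2_c as ijson_yajl2_c
from tabulate import tabulate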
Example no. 26
def parser(files, accuracy):
    datapoints = []
    x = 0

    for filename in files:
        with open(filename, 'rb') as f:
            print "Parsing %s" % (filename)
            if args.fast:
                jsondata = json.load(f)
                data = jsondata["locations"]

            else:
                objects = ijson.items(f, 'locations.item')
                data = (o for o in objects)

            for entry in data:
                try:
                    if entry["accuracy"] > accuracy:
                        lat = str(float(entry["latitudeE7"]) / 1e7)
                        long = str(float(entry["longitudeE7"]) / 1e7)
                        #lat = str(decimal.Decimal(entry["latitudeE7"]) * decimal.Decimal(0.0000001))
                        #long = str(decimal.Decimal(entry["longitudeE7"]) * decimal.Decimal(0.0000001))
                        location = (lat, long, 1)
                        if len(datapoints) > 2:  # make sure the list is long enough to bisect
                            # bisect based on the current location from google
                            lookuplocation = bisect.bisect_left(datapoints, location)
                            #print "length: %s, location: %s" % (len(datapoints), lookuplocation)

                            # check whether we are at the end of the data set; if so, append
                            if lookuplocation + 1 < len(datapoints):
                                # print "listlat: %s, lat: %s" %(datapoints[lookuplocation+1][0], lat)
                                if datapoints[lookuplocation + 1][0] == lat:  # lat matches an existing point
                                    if datapoints[lookuplocation + 1][1] == long:  # long matches too: a duplicate, so bump its count
                                        datapoints[lookuplocation + 1] = (lat, long, datapoints[lookuplocation + 1][2] + 1)
                                        # print "duplicate found: %s, %s" % (datapoints[lookuplocation+1], datapoints[lookuplocation+1][2])
                                    else:
                                        # print "long didnt match"
                                        datapoints.insert(lookuplocation, location)
                                else:
                                    # print "lat didnt match"
                                    datapoints.insert(lookuplocation, location)
                            else:  # if the data goes at the end just append it instead of inserting it
                                #print "length: %s, location: %s" % (len(datapoints), lookuplocation)
                                datapoints.append(location)
                        else:
                            datapoints.append(location)
                except KeyError:
                    #print "entry contained no accuracy information, excluding"
                    continue
    count, frequent = 0, []
    for data in datapoints:
        if data[2] > count:
            frequent = data
            count = frequent[2]

    return datapoints, frequent
Example no. 27
def load_ijson(filepath, itempath):
    return ijson.items(open(filepath, "rb"), itempath)
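One caveat with this one-liner: the file handle is only closed when the returned generator is garbage-collected. A hedged sketch with the same signature that ties the handle's lifetime to the iteration instead:

def load_ijson(filepath, itempath):
    # Wrapping the iteration in a generator lets the with-block close
    # the file once the caller finishes (or abandons) the loop.
    def _gen():
        with open(filepath, "rb") as f:
            for item in ijson.items(f, itempath):
                yield item
    return _gen()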