Example #1
def test__json_next_signature():

    name = 'Foo Bar'
    filename = '/tmp/foobar'

    minhash = (2, 3, 4, 5, 6)
    t = OrderedDict((('ksize', 21),
                     ('num', len(minhash)),
                     #('md5sum', ),
                     ('cardinality', 123456),
                     ('mins', minhash)))
    s = json.dumps(t)
    if sys.version_info[0] < 3:
        s = unicode(s)
    it = ijson.parse(io.StringIO(s))
    # no MD5SUM
    sig = _json_next_signature(it, name, filename,
                               ignore_md5sum=True,
                               ijson=ijson)

    # check MD5SUM
    minhash = (5,)
    t = OrderedDict((('ksize', 20),
                     ('num', len(minhash)),
                     ('md5sum', 'eae27d77ca20db309e056e3d2dcd7d69'),
                     ('cardinality', 123456),
                     ('mins', minhash)))
    s = json.dumps(t)
    if sys.version_info[0] < 3:
        s = unicode(s)
    it = ijson.parse(io.StringIO(s))
    sig = _json_next_signature(it, name, filename,
                               ignore_md5sum=False,
                               ijson=ijson)
Example #2
    def get_metadata(self, fname):
        """
        If the file is not too large, return its metadata.
        """
        if os.stat(fname).st_size > self.filesize_limit > 0:
            return
        if fname == self.lastFileName and self.lastDocMeta is not None:
            self.insert_meta_year(self.lastDocMeta)
            return self.lastDocMeta
        self.lastFileName = fname
        if self.format == 'json':
            fIn = open(fname, 'r', encoding='utf-8-sig')
        elif self.format == 'json-gzip':
            fIn = gzip.open(fname, 'rt', encoding='utf-8-sig')
        else:
            return {}
        metadata = {}
        curMetaField = ''
        JSONParser = ijson.parse(fIn)
        for prefix, event, value in JSONParser:
            if (prefix, event) == ('meta', 'map_key'):
                curMetaField = value
            elif len(curMetaField) > 0 and prefix.startswith('meta.'):
                metadata[curMetaField] = value
            elif (prefix, event) == ('meta', 'end_map'):
                break
        self.lastDocMeta = metadata
        fIn.close()
        self.insert_meta_year(metadata)
        return metadata
Example #3
def geojson_converter(inputfile, count):
    """
    The main entry point
    """
    if inputfile is None:
        raise Exception("Missing input file")

    max_count = count
    current_count = 0
    parser = ijson.parse(inputfile)

    f = GeojsonFsm()

    for prefix, event, value in parser:
        try:
            f.submitEvent(prefix, event, value)

        except EndOfFile:
            sys.exit(0)  # no error

        except Exception as e:
            logging.error(e)
            sys.exit(1)

        if count is not None:
            current_count += 1
            if current_count == max_count:
                break
Example #4
def parse_location(stream, filter):
    """
    Given a stream and a filter, parse the JSON data that fits the filter into GeoJSON features.
    """

    parser = ijson.parse(stream)
    reading = False
    obj = {}
    key = None
    value = None
    for prefix, event, value in parser:
        #print "prefix: " + str(prefix)
        if prefix == 'locations' and event == 'start_array':
            reading = True
        elif prefix == 'locations' and event == 'end_array':
            reading = False
        elif reading:
            if event == 'start_map' and prefix == 'locations.item':
                obj = {}
                activities = {}
            elif event == 'end_map' and prefix == 'locations.item':
                obj['activities'] = activities
                yield create_feature(obj, filter)
            elif event == 'map_key':
                key = value
            elif prefix == 'locations.item.%s' % key and value is not None:
                obj[key] = value
            elif prefix == 'locations.item.activitys.item.activities.item.type':
                activity = value
            elif prefix == 'locations.item.activitys.item.activities.item.confidence':
                confidence = value
            elif prefix == 'locations.item.activitys.item.activities.item' and event == 'end_map':
                activities[activity] = confidence
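
A minimal driver for parse_location(), as a hedged sketch: the input path and
the write_geojson name are hypothetical, and `filter` is whatever dict the
project's create_feature() expects.

def write_geojson(in_path, out_path, filter=None):
    import json
    # Stream features into a FeatureCollection without holding the whole
    # list in memory.
    with open(in_path) as stream, open(out_path, 'w') as out:
        out.write('{"type": "FeatureCollection", "features": [')
        for i, feature in enumerate(parse_location(stream, filter)):
            if i:
                out.write(',')
            out.write(json.dumps(feature))  # assumes features are plain dicts
        out.write(']}')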
Example #5
def main():

    # Establish communication queues
    tasks = multiprocessing.JoinableQueue(16)
    results = multiprocessing.Queue()

    # Start consumers
    num_consumers = multiprocessing.cpu_count() * 2
    # num_consumers = 1
    print('Creating %d consumers' % num_consumers)
    consumers = [ Consumer(tasks, results)
                  for i in range(num_consumers) ]
    for w in consumers:
        w.start()

    r = requests.post(NEO4J_CREATE_TRAN_URL, headers=generateheaders(), data=json.dumps(GET_PDB_IDS_NEO4J_STATEMENT),
                      stream=True)
    parser = ijson.parse(r.raw)
    for prefix, event, value in parser:
        if (prefix, event) == ('results.item.data.item.row.item', 'string'):
            # process_pdb_id(value)
            tasks.put(MapPDBOrganism(value), True, None)

    # Add a poison pill for each consumer
    for i in range(num_consumers):
        tasks.put(None)

    # Wait for all of the tasks to finish
    tasks.join()
Example #6
def is_geojson(path):
    with open(path) as fp:
        try:
            parser = ijson.parse(fp)
            data = {}
            top_keys = ['coordinates', 'features', 'geometry', 'geometries']
            for prefix, event, value in parser:
                if (prefix, event) == ('type', 'string'):
                    data['type'] = value
                elif (prefix, event) == ('', 'map_key') and value in top_keys:
                    data[value] = True
            gtype = data.get('type')
            geo_types = [
                'LineString', 'MultiLineString', 'MultiPoint', 'MultiPolygon',
                'Point', 'Polygon'
            ]
            return any([
                gtype in geo_types and 'coordinates' in data,
                gtype == 'Feature' and 'geometry' in data,
                gtype == 'FeatureCollection' and 'features' in data,
                gtype == 'GeometryCollection' and 'geometries' in data,
            ])
        except Exception as exc:
            logger.debug('Exception during geojson validation: {}'.format(exc))
    return False
Example #7
def stream_geojson(stream):
    '''Yield GeoJSON features one at a time from a file-like stream.
    '''
    data = ijson.parse(stream)

    for (prefix1, event1, value1) in data:
        if event1 != 'start_map':
            # A root GeoJSON object is a map.
            raise ValueError((prefix1, event1, value1))

        for (prefix2, event2, value2) in data:
            if event2 == 'map_key' and value2 == 'type':
                prefix3, event3, value3 = next(data)

                if event3 != 'string' or value3 != 'FeatureCollection':
                    # We only want GeoJSON feature collections
                    raise ValueError((prefix3, event3, value3))

            elif event2 == 'map_key' and value2 == 'features':
                prefix4, event4, value4 = next(data)

                if event4 != 'start_array':
                    # We only want lists of features here.
                    raise ValueError((prefix4, event4, value4))

                for (prefix5, event5, value5) in data:
                    if event5 == 'end_array':
                        break

                    # let _build_value() handle the feature.
                    _data = chain([(prefix5, event5, value5)], data)
                    feature = _build_value(_data)
                    yield feature
Example #9
    def extract_threatname(self, report):
        parser = ijson.parse(report)
        for prefix, event, value in parser:
            if prefix == "analysis.signaturedetections.strategy.item.threatname" \
                    and value is not None and str(value).lower() != "unknown":
                self.add_probable_name(str(value))
                self.add_tag(str(value).lower())
Example #10
    def _detect_fields_in_geojson(self, resource_dict):
        geo_columns_dict = {}
        try:
            upload = uploader.ResourceUpload(resource_dict)
            with io.open(upload.get_path(resource_dict['id']),
                         'rb') as f, io.TextIOWrapper(
                             f, encoding='utf-8-sig') as tf:
                parser = ijson.parse(tf)

                geo_columns = set()
                i = 0
                for prefix, event, value in parser:
                    if prefix == u'features.item.properties' and event == u'map_key':
                        geo_columns.add(value)
                        i += 1

                    if i > 10:
                        break
                geo_columns_dict = [{
                    'value': item,
                    'text': item
                } for item in sorted(geo_columns)]
        except Exception as e:
            log.warning(
                u'Error detecting GeoJSON fields for resource {}: {}'.format(
                    resource_dict.get('name', ''), str(e)))
            geo_columns_dict = {}
        return geo_columns_dict
Example #11
    def test_parse(self):
        events = parse(StringIO(JSON))
        events = [
            value
            for prefix, event, value in events
            if prefix == 'docs.item.meta.item.item'
        ]
        self.assertEqual(events, [1])
Example #12
    def extract_info(self, report):
        # First, build an array with all the antivirus information that might
        # be of interest
        av_prefixes = []
        for av in self._analysis._file['antivirus']:
            av_prefixes.append('data.signatures.item.data.item.{}'.format(av))

        parser = ijson.parse(report)
        self.results['signatures'] = []
        signature = dict()

        for prefix, event, value in parser:
            if prefix == "data.signatures.item" and event == "end_map":
                self.results['signatures'].append(signature)
                signature = dict()
            elif prefix == "data.signatures.item.name":
                signature['name'] = value
                self.add_tag(value)
            elif prefix == "data.signatures.item.severity":
                signature['severity'] = value
            elif prefix == "data.signatures.item.description":
                signature['description'] = value
            elif ('name' in signature
                  and signature['name'] == 'antivirus_virustotal'
                  and prefix in av_prefixes):
                self._analysis._file.update_value(['antivirus', prefix.split('.')[-1]], value)
            elif prefix == "data.malfamily":
                self.results['classification'] = value
            elif prefix == "data.malscore":
                self.results['score'] = str(value)
            elif prefix in ["data.network.domains.item.domain", "data.network.hosts.item.ip", "data.network.traffic.http.item.uri"]:
                self.add_ioc(value)
Example #13
def read_regions():
    VG_VERSION = '1.2'
    VG_PATH = '/home/joe/git/VG_raw_data'
    VG_REGION_PATH = '%s/%s/region_descriptions.json' % (VG_PATH, VG_VERSION)
    # parser = ijson.parse(open('test_region.json'))
    parser = ijson.parse(open(VG_REGION_PATH))

    last_value = None
    Dic = {}
    regions = []
    dic = {}
    for prefix, event, value in parser:
        if value == 'regions':
            Dic = {}
            regions = []
            last_value = None
        elif last_value == 'id':
            Dic['regions'] = regions
            Dic['id'] = value
            with open('test_id_%s.json' % value, 'w') as f:
                json.dump(Dic, f)
                break
        elif event == 'map_key':
            last_value = value
        elif event == 'end_map':
            regions.append(dic)
            dic = {}
            last_value = None
        elif last_value:
            dic[last_value] = value
Example #14
def test_load_signature_json():
    email = '*****@*****.**'
    name = 'Foo Bar'
    filename = '/tmp/foobar'

    minhash = (2, 3, 4, 5, 6)
    t = OrderedDict((
        ('email', email),
        ('name', name),
        ('filename', filename),
        (
            'signatures',
            (
                OrderedDict((
                    ('ksize', 21),
                    ('num', len(minhash)),
                    #('md5sum', ),
                    ('cardinality', 123456),
                    ('mins', minhash))), ))))
    s = json.dumps(t)
    if sys.version_info[0] < 3:
        s = unicode(s)
    it = ijson.parse(io.StringIO(s))
    # no MD5SUM
    sig_entry = load_signature_json(it, ignore_md5sum=True)
Example #15
    def implement_tags(self):
        """Applies the ingredient tags to the Recipe1M+ recipe corpus."""
        # Import the {ingredient: tags} dictionary
        tags_dict = instance.tags_cleaner()
        # Stream the corpus chunk by chunk instead of loading it whole
        recipe = ijson.parse(open("layer1.json"))

        for prefix, event, value in recipe:
            if prefix == "item.ingredients.item.text":  # Grab a recipe instruction
                tokenized = nltk.word_tokenize(value)  # -> [word1, word2, word3]
                dracula = 0
                new_string = ""
                for word in tokenized:  # For each word in the list
                    if dracula > 1:  # Avoid infinite loop (see below)
                        continue
                    elif word in tags_dict:  # The word is in the tag dictionary
                        for i in range(len(tokenized)):  # Find index of word
                            if tokenized[i] == word:
                                # Insert the associated tag behind the word
                                tokenized.insert(i + 1, tags_dict[word])
                                # Merge the list into a string
                                new_string = " ".join(tokenized)
                                # Avoid the infinite loop created by this for loop
                                dracula += 1
                print(new_string)  # Print each instruction with its tags.
Example #16
def dump_candidate_dict_to_db():
    with open(constant.CANDIDATE_DICT_JSON_PATH, 'r') as fd:
        parser = ijson.parse(fd)
        candidate = None
        for prefix, event, value in parser:
            print(prefix, event, value)
            if (prefix, event) == ("", "map_key"):
                # A new top-level key starts a new candidate; save the
                # previous one first.
                if candidate is not None:
                    candidate.save()
                candidate = Candidate()
                candidate.word = value
                candidate.left_set = {}
                candidate.right_set = {}
            elif event == "map_key" and prefix.endswith("left_set"):
                key = value
                left_temp_dict = {key: None}
            elif event == "number" and prefix.endswith("left_set.%s" % key):
                left_temp_dict[key] = str(value)
                candidate.left_set.update(left_temp_dict)
            elif event == "map_key" and prefix.endswith("right_set"):
                key = value
                right_temp_dict = {key: None}
            elif event == "number" and prefix.endswith("right_set.%s" % key):
                right_temp_dict[key] = str(value)
                candidate.right_set.update(right_temp_dict)
            elif event == "number" and prefix.endswith("count"):
                candidate.count = value
        if candidate is not None:
            # Save the last candidate once the stream is exhausted.
            candidate.save()
Example #17
    def _parse_response(self):
        """Looks for `result.item` (array), `result` (object) and `error` (object) keys and parses
        the raw response content (stream of bytes)

        :raise:
            - ResponseError: If there's an error in the response
            - MissingResult: If no result nor error was found
        """

        response = self._get_response()

        has_result_single = False
        has_result_many = False
        has_error = False

        builder = ObjectBuilder()

        for prefix, event, value in ijson.parse(response.raw, buf_size=self._chunk_size):
            if (prefix, event) == ('error', 'start_map'):
                # Matched ServiceNow `error` object at the root
                has_error = True
            elif prefix == 'result' and event in ['start_map', 'start_array']:
                # Matched ServiceNow `result`
                if event == 'start_map':  # Matched object
                    has_result_single = True
                elif event == 'start_array':  # Matched array
                    has_result_many = True

            if has_result_many:
                # Build the result
                if (prefix, event) == ('result.item', 'end_map'):
                    # Reached end of object. Set count and yield
                    builder.event(event, value)
                    self.count += 1
                    yield getattr(builder, 'value')
                elif prefix.startswith('result.item'):
                    # Build the result object
                    builder.event(event, value)
            elif has_result_single:
                if (prefix, event) == ('result', 'end_map'):
                    # Reached end of the result object. Set count and yield.
                    builder.event(event, value)
                    self.count += 1
                    yield getattr(builder, 'value')
                elif prefix.startswith('result'):
                    # Build the result object
                    builder.event(event, value)
            elif has_error:
                if (prefix, event) == ('error', 'end_map'):
                    # Reached end of the error object - raise ResponseError exception
                    raise ResponseError(getattr(builder, 'value'))
                elif prefix.startswith('error'):
                    # Build the error object
                    builder.event(event, value)

        if (has_result_single or has_result_many) and self.count == 0:  # Results empty
            return

        if not (has_result_single or has_result_many or has_error):  # None of the expected keys were found
            raise MissingResult('The expected `result` key was missing in the response. Cannot continue')
Example #18
def load_signatureset_json_iter(data,
                                ksize=None,
                                ignore_md5sum=False,
                                ijson=ijson):
    """
    - data: file handle (or file handle-like) object
    - ksize: only yield signatures with this k-mer size (yield all if None)
    - ignore_md5sum: skip md5sum verification of the signatures
    - ijson: ijson backend
    """

    parser = ijson.parse(data)

    prefix, event, value = next(parser)
    assert prefix == '' and event == 'start_array' and value is None

    siglist = []
    n = 0
    while True:
        try:
            sig = load_signature_json(
                parser,
                prefix_item='item.signatures.item.mins.item',
                ignore_md5sum=ignore_md5sum,
                ijson=ijson)
            if not ksize or ksize == sig.minhash.ksize:
                yield sig
        except ValueError:
            # possible end of the array of signatures
            prefix, event, value = next(parser)
            assert event == 'end_array'
            break
        n += 1
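
A usage sketch for the iterator above, assuming a signature-set file at a
hypothetical path; load_signature_json comes from the same module.

with open('/tmp/big.sig.json') as fh:
    for sig in load_signatureset_json_iter(fh, ksize=31):
        # Only signatures whose MinHash was built with k=31 arrive here.
        print(sig.minhash.ksize)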
Example #19
def dump(path):  # pragma: no cover
    """Prints all the data ijson finds in a file in ijson event form; not
    recommended for large files."""
    with PrefixedJSON(path) as json_file:
        for prefix, event, value in ijson.parse(json_file):
            print("prefix=" + prefix + ", event=" + event + ", value=" +
                  str(value))
Example #20
    def _validate_(self, level):
        # doi.org/10.1371/journal.pone.0031009
        keys_found = set()

        # Can't self.open(mode='rb'), so we defer to the backing pathlib object
        with self.path.open(mode='rb') as fh:
            root_element = None
            for prefix, event, value in ijson.parse(fh):
                if root_element is None:
                    if event != 'start_map':
                        raise ValidationError('Root element of file must be a '
                                              'JSON object')
                    else:
                        root_element = True

                # Skip parsing attributes that could be prohibitively large
                if prefix.startswith('placements') \
                        or prefix.startswith('tree'):
                    continue

                # Restricted to only checking root-level keys
                if event == 'map_key' and prefix == '':
                    keys_found.add(value)

        if keys_found != self.fields:
            raise ValidationError('Expected the following fields: %s, found '
                                  '%s.' %
                                  (sorted(self.fields), sorted(keys_found)))
Example #21
def prefix_finder(path):  # pragma: no cover
    "returns all the prefixes ijson finds in a file; used for parser development"
    with PrefixedJSON(path) as json_file:
        prefixes = set()
        for p, _, _ in ijson.parse(json_file):
            prefixes.add(p)
    return prefixes
Example #22
def jsonToTxt(json_filename, goal_path):
    path = goal_path  # this will be the path where the code puts the txts
    dont_allow = [
        "reviewerID", "asin", "reviewerName", "helpful", "reviewText",
        "overall", "summary", "reviewTime", "unixReviewTime"
    ]
    with open(json_filename, encoding="UTF-8") as json_file:
        count = 0  # counts the txt generated
        bandera = False
        for line_number, line in enumerate(json_file):
            if count >= 10481487:
                line_as_file = io.StringIO(line)
                # Use a new parser for each line
                json_parser = ijson.parse(line_as_file)
                filebody = " "
                for prefix, event, value in json_parser:  # we only need each value
                    if value is not None and value not in dont_allow:  # skip field names
                        filebody += str(value)  # txt body
                        filebody += " "
                path += str(count)
                path += ".txt"
                with open(path, "w") as out_file:
                    out_file.write(filebody)
                path = goal_path
            count += 1
Example #23
    def is_wallet_file_valid(cls, path):
        """
        Check if the given wallet file is valid.

        .. note::

            This method only continues reading the file until its
            validity can be determined, and should be preferred instead
            of :meth:`is_wallet_data_valid` when checking a file.

        :param str path: Path to the wallet file

        :returns: True if valid, False otherwise
        :rtype: bool
        """
        with open(path, "rb") as f:
            try:
                for pfx, _, _ in ijson.parse(f):
                    if pfx == "properties.gap_limit":
                        return True
                    if pfx == "wallet_data":
                        return True
            except ijson.JSONError:
                return False

        return False
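
A quick usage sketch; the class name Wallet and the path are assumptions here,
since the snippet shows only the classmethod.

# `Wallet` stands in for whatever class defines is_wallet_file_valid().
if not Wallet.is_wallet_file_valid('/tmp/wallet.json'):
    raise ValueError('not a valid wallet file')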
Example #24
    def __extract_books_from_json(self):
        with open(JSON_FILE_PATH, encoding='utf-8-sig') as input_file:
            logging.info('reading from json file')
            parser = ijson.parse(input_file)
            books = []
            __found_book = False
            for prefix, event, value in parser:
                if prefix == "items.item.title":
                    __found_book = True
                    book = Book()
                    book.title = value
                    book.author = " "
                    book.publisher = " "
                    logging.info(book.title)
                if prefix == "items.item.products.item.price":
                    book.price = value
                if prefix == "items.item.authors.item.firstName":
                    book.author = value
                if prefix == "items.item.authors.item.lastName":
                    book.author = str(book.author) + " " + value
                if prefix == "items.item.audioPublisherTitle":
                    book.publisher = value
                if __found_book:
                    __found_book = False
                    books.append(book)

            return books
Example #25
def AnotherWayToGetData():
    # Get a seq of the Ledger EINs that we want to watch out for.
    our_eins = GetListOfOurEINs()

    # Set up our index file client.
    s3_resource = boto3.resource('s3')
    indicies = Forms990Indicies(s3_resource)
    indicies.save_all_indicies()

    # Start a stream of the index JSON.
    # If an EIN comes through that's in our_eins, take note of the following URL
    should_grab = False

    i = 0

    for fd in indicies.saved_jsons.values():
        parser = ijson.parse(fd)
        for prefix, event, value in parser:
            if event == 'string':
                if prefix.endswith('.item.EIN'):
                    should_grab = value in our_eins
                if should_grab and prefix.endswith('.item.URL'):
                    # Uncommenting this would actually grab the resource & log it:
                    # print(GetScheduleIUrlOnly(value, s3_resource))
                    i += 1

    print("done, would grab this many: " + str(i))
Example #26
def doAFile(thisFilename, myEventID):
    global itemList, OBJStack, fileEntry
    FDUseful = open(usefulName, "w")
    FDUseful.write(f"""USEFULPREFIX = {C.OBSSTR}{C.NEWLINE}""")
    FDUseful.flush()
    FDUseful.close()
    with open(f"""{C.CACHEDIR}ijsonOut.json""", "w") as FDOut:
        for prefix, the_type, value in IJ.parse(open(thisFilename)):
            if prefix.find(myEventID) > -1:
                prefix = prefix.replace(myEventID, "$EVENTID$")
            thisTuple = (prefix, the_type, value)
            # itemList.append(thisTuple)
            COMBO = f"""{prefix}::{the_type}"""
            if COMBO not in usefulKeys:
                FDOut.write(str(thisTuple))
                FDOut.flush()
                usefulKeys.append(COMBO)
                FDUseful = open(usefulName, "ta")
                outStr = f"""{C.TABSTR}{C.DQTSTR}{COMBO}{C.DQTSTR},{C.NEWLINE}"""
                FDUseful.write(outStr)
                FDUseful.flush()
                FDUseful.close()
                FDMoreUseful = open(moreUsefulName, "ta")
                outStr = f"""({C.DQTSTR}{prefix}{C.DQTSTR}, {C.DQTSTR}{the_type}{C.DQTSTR}, C.TYPE, """
                outStr += f"""{C.DQTSTR}headerName{C.DQTSTR}, SQLDEFAULT, SCNDEFAULT,),{C.NEWLINE}"""
                FDMoreUseful.write(f"""{str(thisTuple)}{C.NEWLINE}""")
                FDMoreUseful.flush()
                FDMoreUseful.close()
    FDUseful = open(usefulName, "ta")
    outStr = f"""{C.CBSSTR}{C.NEWLINE}"""
    FDUseful.write(outStr)
    FDUseful.flush()
    FDUseful.close()
Example #27
def extractAllDescriptions(from_path, to_path, extension=".json", chunk_dim=50000, continue_with=0):
    count = 0
    fileNumber = 0
    with open(from_path, 'r') as f:
        parser = ijson.parse(f)
        obj = {}
        if fileNumber >= continue_with:
            w = codecs.open(to_path + "-" + str(fileNumber) + extension, "w", "utf-8")
        for prefix, event, value in parser:

            if prefix=="item.id":
                obj["id"] = value
            elif prefix=="item.descriptions.en.value" and len(value.split())>1:
                obj["description"] = value
                if fileNumber >= continue_with:
                    json.dump(obj, w)
                obj = {}
                count+=1
                if count %chunk_dim ==0:
                    if fileNumber >= continue_with:
                        w.close()
                    fileNumber += 1
                    if fileNumber >= continue_with:
                        w = codecs.open(to_path +"-"+ str(fileNumber) + extension, "w", "utf-8")
                    print(count)
                else:
                    w.write("\n")
        w.close()
Example #28
def get_sku(location, instance_type, products_file):
    """
    Optimized JSON parsing to find the SKU for the provided location and type.
    """
    # SKU dicts have prefixes like 76V3SF2FJC3ZR3GH
    sku_dict_prefix = re.compile('^[a-zA-Z0-9]+$')
    sku = ''
    matches = 0
    event_count = 0
    with open(products_file) as f:
        parser = ijson.parse(f)
        for prefix, event, value in parser:
            event_count += 1
            if prefix.endswith('.sku'):
                # Save the SKU of the current SKU dict
                sku = value
            elif prefix.endswith(
                    '.productFamily') and value == "Compute Instance":
                matches += 1
            elif prefix.endswith('.location') and value == location:
                matches += 1
            elif prefix.endswith('.instanceType') and value == instance_type:
                matches += 1
            elif event == 'end_map' and sku_dict_prefix.match(prefix):
                # We've reached the end of the SKU dict, is this the right one?
                if matches == 3:
                    # All three values matched, this is our sku
                    logger.debug("SKU: {}".format(sku))
                    return sku
                else:
                    # This wasn't the right SKU dict, reset our matches
                    matches = 0
Example #29
def doAFile(thisFilename, myEventID=""):
	global fileEntry, usefulName, moreUsefulName, usefulBit, moreUsefulBit, dirPFX
	DBLinks = DB.doOpen(C.SQLCONFIG)
	# FDUseful = open(usefulName, "w")
	# FDUseful.write(f"""USEFULPREFIX = {C.OBSSTR}{C.NEWLINE}""")
	# FDUseful.flush()
	# FDUseful.close()
	for prefix, the_type, value in IJ.parse(open(thisFilename)):
		if prefix == "" or prefix is None:
			continue
		if prefix.find(myEventID) > -1 and myEventID != "":
			prefix = prefix.replace(myEventID, "$EVENTID$")
		thisTuple = (prefix, the_type, value)
		# itemList.append(thisTuple)
		if DB.checkUpdatePrefix(DBLinks, prefix) is False:
			emptyPFXDict = C.PREFIXEMPTYDICT()
			emptyPFXDict[C.PFXlastSeen] = TDS.nowStrSql(TDS.DT.now())
			emptyPFXDict[C.PFXfirstSeen] = TDS.nowStrSql(TDS.DT.now())
			emptyPFXDict[C.PFXprefixStr] = prefix
			emptyPFXDict[C.PFXkeyType] = the_type
			DB.insertDict(DBLinks, C.GEOJSONPREFIXTABLENAME, emptyPFXDict, C.MYFIELDTYPESDICT)
		# FDUseful = open(usefulName, "ta")
		# outStr = f"""{C.TABSTR}{C.DQTSTR}{COMBO}{C.DQTSTR},{C.NEWLINE}"""
		# FDUseful.write(outStr)
		# FDUseful.flush()
		# FDUseful.close()
		FDMoreUseful = open(moreUsefulName, "ta")
		outStr = f"""({C.DQTSTR}{prefix}{C.DQTSTR}, {C.DQTSTR}{the_type}{C.DQTSTR}, C.TYPE, """
		outStr += f"""{C.DQTSTR}headerName{C.DQTSTR}, SQLDEFAULT, SCNDEFAULT,),{C.NEWLINE}"""
		FDMoreUseful.write(f"""{str(thisTuple)}{C.NEWLINE}""")
		FDMoreUseful.flush()
		FDMoreUseful.close()
Example #30
def extract_path_value_pairs_from_json_iter(inp):
    if isinstance(inp, str):
        inp = io.StringIO(inp)

    path_set = set()

    stack = []

    def get_path():
        q = ''.join(stack)
        if not q.startswith('.'):
            return '.' + q
        return q

    for prefix, event, value in ijson.parse(inp):
        if event == 'start_array':
            stack.append('[]')
        elif event == 'map_key':
            # Remove the previous key, or the dummy pushed by start_map.
            stack.pop()
            stack.append('.' + value)
        elif event == 'start_map':
            stack.append(None)  # dummy
        elif event in ('end_array', 'end_map'):
            stack.pop()
        else:
            assert event in ('boolean', 'number', 'string', 'null')
            q = get_path()
            if q not in path_set:
                yield q, value
                path_set.add(q)
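
The generator yields one (path, value) pair per distinct leaf path; a tiny
worked example:

for path, value in extract_path_value_pairs_from_json_iter('{"a": [{"b": 1}]}'):
    print(path, value)  # prints: .a[].b 1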
Example #31
    def parse(self, response):
        f = urllib.urlopen(open("spiders/amazon/query.txt").read())
        parser = ijson.parse(f)

        for prefix, event, value in parser:
            if prefix == "jobs.item.url":
                yield Request("http://www.amazon.jobs%s" % value, callback=self.parse_link)
Example #32
def detect_format(path, root_path=''):
    """
    Returns the format of OCDS data, and whether the OCDS data is concatenated or in an array.

    If the OCDS data is concatenated or in an array, assumes that all items have the same format as the first item.

    :param str path: the path to a file
    :param str root_path: the path to the OCDS data within the file
    :returns: the format, whether data is concatenated, and whether data is in an array
    :rtype: tuple
    :raises UnknownFormatError: if the format cannot be detected
    """
    with open(path, 'rb') as f:
        events = iter(ijson.parse(f, multiple_values=True))

        while True:
            prefix, event, value = next(events)
            if prefix == root_path:
                break

        if prefix:
            prefix += '.'

        if event == 'start_array':
            prefix += 'item.'
        elif event != 'start_map':
            raise UnknownFormatError(
                'top-level JSON value is a {}'.format(event))

        records_prefix = '{}records'.format(prefix)
        releases_prefix = '{}releases'.format(prefix)
        ocid_prefix = '{}ocid'.format(prefix)
        tag_item_prefix = '{}tag.item'.format(prefix)

        has_records = False
        has_releases = False
        has_ocid = False
        has_tag = False
        is_compiled = False
        is_array = event == 'start_array'

        for prefix, event, value in events:
            if prefix == records_prefix:
                has_records = True
            elif prefix == releases_prefix:
                has_releases = True
            elif prefix == ocid_prefix:
                has_ocid = True
            elif prefix == tag_item_prefix:
                has_tag = True
                if value == 'compiled':
                    is_compiled = True
            if not prefix and event not in ('end_array', 'end_map', 'map_key'):
                return _detect_format_result(True, is_array, has_records,
                                             has_releases, has_ocid, has_tag,
                                             is_compiled)

        return _detect_format_result(False, is_array, has_records,
                                     has_releases, has_ocid, has_tag,
                                     is_compiled)
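
A usage sketch for detect_format(); the file path is hypothetical, and per the
docstring the return value unpacks into the format plus the two flags.

detected_format, is_concatenated, is_array = detect_format('release_package.json')
print(detected_format, is_concatenated, is_array)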
Example #33
    def collect_data(self):
        outfile = open('/opt/projects/domain_nlp/sentences', 'w+')
        story_title = None
        parsed = ijson.parse(open(self.json_filename), buf_size=1024)
        comments = []
        line = 0
        while True:
            # ijson yields (prefix, event, value) triples; stop cleanly at
            # the end of the stream.
            try:
                _, key, val = next(parsed)
            except StopIteration:
                break

            line += 1

            if key == 'map_key':
                _, next_key, next_val = next(parsed)
                line += 1

                if val == 'points':
                    points = next_val
                    if points < 1:
                        continue
                elif val == 'story_title':
                    story_title = next_val
                elif val == 'comment_text' and next_val:
                    comment_text = self.sanitize_text(next_val, story_title)
                    outfile.writelines(comment_text)

                    # comments.append({'comment': comment_text, 'points': points,
                    #                  'story_title': story_title})

            if line % 100000 == 0:
                print(len(comments))

        return comments
Example #34
def retrieve_from_mgrast(meta_id, stage_name="upload"):
    url = "http://api.metagenomics.anl.gov/1/download/mgm" + meta_id
    print("Retrieving " + meta_id + " from " + url)
    
    res = urllib2.urlopen(url)
    #page = res.read()
    
    #ip = ijson.parse(page)
    ip = ijson.parse(res)
    
    elements = {}
    state = 0
    for p, e, v in ip:
        #print str(p) + ":" + str(e)
        if state == 2:
            #print str(p) + ":" + str(e)
            if str(p) == data_item_file_name:
                elements[data_item_file_name] = str(v)
            if str(p) == data_item_url:
                elements[data_item_url] = str(v)

        if state == 1:
            if str(p) == 'data.item.stage_name' and str(v) == stage_name:
                state = 2
                
        if str(p) == 'data.item' and str(e) == 'start_map':
            #print("start_map")
            state = 1

        if str(p) == 'data.item' and str(e) == 'end_map':
            #print("end_map")
            state = 0
            
    return elements
Example #35
def distributed_save():
    f = open('namuwiki_20190312.json')
    data = ijson.parse(f)
    p = open('namu_sentences.txt', 'w', encoding='utf-8')
    for prefix, event, value in data:
        if prefix == "item.text":
            sentences = re.compile('[\n]').split(value)

            for i, s in enumerate(sentences):
                try:
                    sentences[i] = get_namuwiki_text(s.strip())
                except Exception as e:
                    print(e, "in sentence", s)

            new_s = '\n'.join(sentences)

            new_s = re.compile('[]]|[[]|[|][|]|[{][{][{]|[}][}][}]').sub(
                '', new_s)
            new_s = re.compile('[.]+|[?]|[\n]').split(new_s)
            new_s = [s.strip() for s in new_s if len(s) > 2]

            p.write('\n'.join(new_s) + '\n')
            print(new_s)

        elif prefix == "item.title":
            p.write(value + '\n')

    p.close()
Example #36
    def get_tables(self):
        prefixes = dict()
        with open(self._filename) as f:
            for prefix, event, _ in ijson.parse(f):
                if event in ("start_map", "start_array"):
                    prefixes[".".join([self._root_prefix, prefix])] = None
        return [self._root_prefix] + list(prefixes.keys())[1:]
Example #37
def load_json(filename):
    with open(filename, 'r') as fd:
        parser = ijson.parse(fd)
        ret = {'signatures': {}, 'antivirus': {}}
        current = {}
        for prefix, event, value in parser:
            if prefix == "signatures.item.name":
                current['name'] = value
            elif prefix == "signatures.item.markcount":
                current['markcount'] = str(value)
            elif prefix == "signatures.item.severity":
                current['severity'] = str(value)
            elif prefix == "signatures.item.description":
                current['description'] = str(value)
            elif (prefix, event) == ("signatures.item", "end_map"):
                # Commit the signature once all of its fields have been seen,
                # so the field order within each JSON object does not matter.
                name = current.pop('name', None)
                if name is not None:
                    ret['signatures'][name] = current
                current = {}
            elif prefix.startswith("virustotal.scans") and prefix.endswith('.result'):
                if value is not None:
                    av_name = prefix.split('.')[2]
                    ret['antivirus'][av_name] = value
        return ret
Example #38
    def _parse_categories_json(cls):
        categories_map = {}

        categories_json_stream = cls._request_epg_json(VaderStreamsConstants.CATEGORIES_PATH,
                                                       VaderStreamsConstants.CATEGORIES_JSON_FILE_NAME,
                                                       {})

        logger.debug('Processing VaderStreams JSON categories\n'
                     'File name => {0}'.format(VaderStreamsConstants.CATEGORIES_JSON_FILE_NAME))

        try:
            ijson_parser = ijson.parse(categories_json_stream)

            for (prefix, event, value) in ijson_parser:
                if event == 'string':
                    categories_map[int(prefix)] = value

            logger.debug('Processed VaderStreams JSON categories\n'
                         'File name => {0}'.format(VaderStreamsConstants.CATEGORIES_JSON_FILE_NAME))
        except Exception:
            logger.debug('Failed to process VaderStreams JSON categories\n'
                         'File name => {0}'.format(VaderStreamsConstants.CATEGORIES_JSON_FILE_NAME))

            raise

        return categories_map
Example #39
    def extract_iocs(self, report):
        iocs = set()
        parser = ijson.parse(report)
        lines = ""
        for prefix, event, value in parser:
            if prefix in [
                    "analysis.behavior.network.tcp.packet.item.srcip",
                    "analysis.behavior.network.tcp.packet.item.dstip",
                    "analysis.behavior.network.udp.packet.item.srcip",
                    "analysis.behavior.network.udp.packet.item.dstip",
                    "analysis.behavior.network.dns.packet.item.name",
            ]:
                if not value.startswith("192.168."):
                    iocs.add(value)
            elif prefix in [
                    "analysis.behavior.network.http.packet.item.header",
                    "analysis.behavior.network.https.packet.item.header",
                    "analysis.behavior.network.sslhttp.packet.item.header",
            ]:
                lines = ""
            elif prefix == "analysis.behavior.network.http.packet.item.header.line.item":
                lines += "{}\n".format(value)
                self.extract_url("http", iocs, lines)
            elif prefix in [
                    "analysis.behavior.network.https.packet.item.header.line.item",
                    "analysis.behavior.network.sslhttp.packet.item.header.line.item"
            ]:
                lines += "{}\n".format(value)
                self.extract_url("https", iocs, lines)

        for ioc in iocs:
            self.add_ioc(ioc)
Example #40
File: sj.py Project: snakeyjson/sj
def yield_offenders(stream, matcher=KEY_RE):
    parser = ijson.parse(stream)
    for prefix, event, value in parser:
        if event != 'map_key':
            continue
        if matcher.match(value):
            continue
        yield prefix, value
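
KEY_RE is not shown in the snippet; a plausible stand-in (an assumption) that
accepts only snake_case keys, plus a small driver:

import re

KEY_RE = re.compile(r'^[a-z0-9_]+$')  # assumed matcher: snake_case keys only

with open('data.json') as fp:
    for prefix, key in yield_offenders(fp, matcher=KEY_RE):
        print('non-conforming key %r under prefix %r' % (key, prefix))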
Example #41
def iter_large_json(json_file, prefix_value, event_value):
    import ijson

    parser = ijson.parse(open(json_file))

    for prefix, event, value in parser:
        # For Flowdock data ('item.content', 'string')
        if (prefix, event) == (prefix_value, event_value):
            yield value
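
A usage sketch matching the Flowdock case mentioned in the comment above (the
file name is hypothetical):

for content in iter_large_json('flowdock_export.json', 'item.content', 'string'):
    print(content)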
Example #42
def test__json_next_atomic_array():
    t = (2, 3, 4, 5, 6)
    s = json.dumps(t)
    if sys.version_info[0] < 3:
        s = unicode(s)
    it = ijson.parse(io.StringIO(s))
    a = _json_next_atomic_array(it)
    assert len(t) == len(a)
    assert all(x == y for x, y in zip(t, a))
Example #43
def json_usage():
    path = r'E:\store_data\meituan.json'
    start = 0
    with codecs.open(path, 'r', encoding='utf8') as f:
        objects = ijson.parse(f)
        for prefix, event, value in objects:
            print(prefix, event, value)
            if start > 200:
                break
            start += 1
Example #44
def getJsonFile():
    if LOCAL is None:
        downloadFile()
        curr_path = os.path.dirname(os.path.abspath(__file__))
        json_data = open(curr_path + '/' + FILENAME)
    else:
        json_data = open(LOCAL)

    f = ijson.parse(json_data)
    return f
Example #45
	def fromJson (self, s, *args):
		import ijson
		it = ijson.parse (Stream (s))
		def _read():
			try:
				e = next (it)
			except StopIteration:
				return self.eos
				
			return [[e[1], e[2]], Lazy (_read)]
		return _read()
Example #46
    def exec_command(self, string_return=False):
        """

        :param string_return: if true, returns a string summary. Otherwise, writes to out_file
        :return:
        """

        json_stream = open(self._in_file)

        elem_id = None
        elem_type = None
        desc_en = None
        label_en = None
        properties = {}
        current_claim_key = None

        elem_count = 1

        for prefix, event, value in ijson.parse(json_stream):
            if event == "end_map":
                if prefix == "item":
                    self._process_current_data(elem_id, elem_type, desc_en, label_en, properties)
                    elem_id = None
                    elem_type = None
                    desc_en = None
                    current_claim_key = None
                    label_en = None
                    properties = {}
                    elem_count += 1
                    if elem_count % 500 == 0:
                        print("Processed " + str(elem_count) + " elements")
                if prefix == "item.claims." + str(current_claim_key) + ".item":
                    # print 'item.claims.' + str(current_claim_key) + '.item'
                    properties[current_claim_key] += 1
            elif event == "string":
                if prefix == "item.id":
                    elem_id = value
                elif prefix == "item.type":
                    elem_type = value
                elif prefix == "item.descriptions.en.value":
                    desc_en = value
                elif prefix == "item.labels.en.value":
                    label_en = value
            elif event == "map_key" and prefix == "item.claims":
                properties[value] = 0
                current_claim_key = value

        print "Errores en propiedades: ", self._err_count_prop
        print "Errores en items: ", self._err_count_item

        if not string_return:
            self._write_to_file()
        else:
            return self._get_string_return()
Example #47
    def _parse_dump_file(self):
        json_stream = open(self._in_dump_file, "r")
        elem_id = None
        elem_type = None
        desc_en = None
        label_en = None
        datatype = None
        datavalue_type = None
        current_claim_key = None
        datavalue_num_id = None
        possible_edges = []

        elem_count = 1

        for prefix, event, value in ijson.parse(json_stream):
            if event == 'end_map':
                if prefix == 'item':
                    for tuple_4 in possible_edges:
                        if self._is_valid_edge(elem_type, tuple_4[0],
                                               tuple_4[1]):  # triple: datatype, datavalue_type, datavalue_num_id
                            self._add_triple_if_proceed(elem_id, tuple_4[2], 'Q' + tuple_4[3], label_en, desc_en)
                            # print elem_id, tuple_4[2], 'Q' + tuple_4[3]
                    elem_id = None
                    elem_type = None
                    current_claim_key = None
                    # label_en = None
                    datavalue_num_id = None
                    datavalue_type = None
                    elem_count += 1
                    possible_edges = []
                    if elem_count % 10000 == 0:
                        print('Processed ' + str(elem_count))
                elif prefix == "item.claims." + str(current_claim_key) + ".item":
                    possible_edges.append((datatype, datavalue_type, current_claim_key, str(datavalue_num_id)))

            elif event == 'string':
                if prefix == 'item.id':
                    elem_id = value
                elif prefix == 'item.type':
                    elem_type = value
                elif prefix == 'item.claims.' + str(current_claim_key) + '.item.mainsnak.datatype':
                    datatype = value
                elif prefix == 'item.claims.' + str(current_claim_key) + '.item.mainsnak.datavalue.value.entity-type':
                    datavalue_type = value
                elif prefix == 'item.labels.en.value':
                    label_en = value
                elif prefix == 'item.descriptions.en.value':
                    desc_en = value
            elif event == 'map_key' and prefix == 'item.claims':
                current_claim_key = value
            elif event == 'number' and prefix == 'item.claims.' + str(
                    current_claim_key) + '.item.mainsnak.datavalue.value.numeric-id':
                datavalue_num_id = value
Example #48
    def __init__(self, in_stream):
        if in_stream is None:
            raise AssertionError('input stream must not be none')
        self.__parser = ijson.parse(in_stream)
        self.__pre_meta_data = []
        self.__post_meta_data = []
        self.__aspect_element_counts = {}
        self.__aspect_names = set()
        self.__first_element = None
        for e in self.aspect_elements():
            if e is not None:
                self.__first_element = e
                break
Example #49
def main(args):
    print("started at {0}".format(time.time()))

    parser = ijson.parse(open(args.json_file))
    session = requests.Session()
    parallelJobs = pp.Server()
    parallelJobs.set_ncpus(args.threads)

    for prefix, event, value in parser:
        if value is not None and event != 'map_key':

            # ijson sends the prefix as a string of keys connected by periods,
            # but Firebase uses periods for special values such as priority.
            # 1. Find '..', and store the indexes of the second period
            doublePeriodIndexes = [m.start() + 1 for m in re.finditer(r'\.\.', prefix)]
            # 2. Replace all '.' with ' '
            prefix = prefix.replace('.', ' ')
            # 3. Use stored indexes of '..' to recreate second periods in the pairs of periods
            prefixList = list(prefix)
            for index in doublePeriodIndexes:
                prefixList[index] = '.'
            prefix = "".join(prefixList)
            # 4. Split on whitespace
            prefixes = prefix.split(' ')
            lastPrefix = prefixes[-1]
            prefixes = prefixes[:-1]

            url = args.firebase_url
            for prefix in prefixes:
                url += prefix + '/'
            url += '.json'
            if args.silent:
                url += '?print=silent'

            if not args.priority_mode:
                if lastPrefix == '.priority':
                    continue
            else:
                if lastPrefix != '.priority':
                    continue

            if event == 'number':
                dataObj = {lastPrefix: float(value)}
            else:
                dataObj = {lastPrefix: value}

            try:
                parallelJobs.submit(sendData, (url, dataObj, session, args), (), ("json", "requests"))
            except Exception:
                print('Caught an error: ' + traceback.format_exc())
                print(prefix, event, value)
Example #50
def jsonObjectReader(filepath):
    """
    Creates a generator that parses an array of json objects from a valid
    json array file, yielding each top level json object in the array.

    :param filepath: path to json file.
    """
    top_level_array = False
    array_stack = 0
    top_level_object = False
    object_stack = 0
    parser = ijson.parse(open(filepath, 'r'))

    for prefix, event, value in parser:
        if event == 'start_array':
            if not top_level_array:
                top_level_array = True
                continue
            else:
                array_stack += 1
        if event == 'start_map':
            if not top_level_object:
                top_level_object = True
                builder = ijson.ObjectBuilder()
            else:
                object_stack += 1
        if event == 'end_map':
            if not top_level_object:
                raise Exception('end_map without a top level object')
            else:
                if object_stack == 0:
                    top_level_object = False
                    yield builder.value
                else:
                    object_stack -= 1
        if event == 'end_array':
            if not top_level_array:
                raise Exception('end_array without a top level array')
            else:
                if array_stack == 0:
                    top_level_array = False
                else:
                    array_stack -= 1
        # convert Decimal to float because mongo can't serialize Decimal
        # TODO is this the right place to do this? Should it be done instead
        # upon save?
        if isinstance(value, decimal.Decimal):
            # TODO this has different behavior on python 2.6 vs 2.7 due to
            # different rounding behavior
            value = float(value)
        builder.event(event, value)
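
A usage sketch, assuming a file that holds a top-level JSON array of objects
(the path is hypothetical):

for obj in jsonObjectReader('/tmp/records.json'):
    print(obj)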
Example #51
def mergeDetails(cids, worker, project):
   data = []
   missed = []
   for cid in cids:
      res = getGerritChangeRequest(cid)
      parser = ijson.parse(res)
      author = ''
      date = ''
      time = ''
      patch = ''
      missedpatch = ''
      for prefix, event, value in parser:
         if prefix == 'messages.item.author.name':
            author = value
         if prefix == 'messages.item.date':
            cutoff = len(value)-3
            date = datetime.strptime(value[:cutoff],"%Y-%m-%d %H:%M:%S.%f")
         if prefix == 'messages.item.message' and author == worker:
            dat = value.strip().split(':')
            if 'patch' in dat[0].lower():
               patch = dat[0].split()
               patch = re.sub("[^0-9]","",patch[len(patch)-1])
            if ('build successful' in value.lower() or 'build succeeded' in value.lower()) and "FAILURE" not in value:
               success = True
            elif 'build failed' in value.lower() or 'build unsuccessful' in value.lower() or "FAILURE" in value:
               success = False
            else:
               continue
            try:
               item = [int(cid),int(patch),date.date(),date.time(),success]
               data += [item] 
            except:
               continue
         elif prefix == 'messages.item.message' and author != worker:
            if 'starting check jobs' in value.lower():
               continue
            dat = value.strip().split(':')
            if 'patch' in dat[0].lower():
               missedpatch = dat[0].split()
               missedpatch = re.sub("[^0-9]","",missedpatch[len(missedpatch)-1])
            try:
               missed += [[int(cid),int(missedpatch),date.date(),date.time()]]
            except:
               continue
      if len(data) >= 10:
         mergeChunk(data,missed,worker,project)
         data = []
         missed = []

   mergeChunk(data,missed,worker,project)
Example #52
    def __superficial_check(cls, fd):
        """Check that the cis and links fields are lists. If not, raise a
        jsonschema.ValidationError. Moves the cursor of fd back to 0."""

        # cis and links record whether the cis and links fields were found in
        # the import.
        # *_start record whether the beginning of a JSON array was found in
        # the cis and links fields of an import.
        # *_end record whether the end of a JSON array was found in the cis
        # and links fields of an import.

        cis = False
        cis_start = False
        cis_end = False
        links = False
        links_start = False
        links_end = False

        parser = ijson.parse(fd)
        for prefix, event, _ in parser:
            if prefix == "cis":
                cis = True
                if event == "end_array":
                    cis_end = True
                if event == "start_array":
                    cis_start = True
            if prefix == "links":
                links = True
                if event == "end_array":
                    links_end = True
                if event == "start_array":
                    links_start = True

        fd.seek(0)

        cis_status = (cis, cis_start, cis_end)
        links_status = (links, links_start, links_end)

        # ok lists the acceptable (found, start_seen, end_seen) states:
        # a complete JSON array, or a field absent from the import.
        ok = [(True, True, True), (False, False, False)]

        if cis_status in ok and links_status in ok:
            return True
        elif cis_status not in ok and links_status not in ok:
            raise jsonschema.ValidationError(
                "CIS and LINKS should be an array.")
        elif cis_status not in ok:
            raise jsonschema.ValidationError("CIS should be an array.")
        elif links_status not in ok:
            raise jsonschema.ValidationError("LINKS should be an array.")
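For illustration, the same superficial check can be condensed into a standalone function; this is a sketch of the logic above, not the module's API:

import io

import ijson

def superficial_check(fd):
    # 'cis' and 'links' must each be a complete JSON array, or be
    # absent from the document entirely.
    status = {'cis': set(), 'links': set()}
    for prefix, event, _ in ijson.parse(fd):
        if prefix in status:
            status[prefix].add(event)
    fd.seek(0)
    ok = (set(), {'start_array', 'end_array'})
    return all(s in ok for s in status.values())

print(superficial_check(io.StringIO(u'{"cis": [], "links": []}')))  # True
print(superficial_check(io.StringIO(u'{"cis": {}, "links": []}')))  # False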
Example #53
0
 def sniff(self):
     with self.open() as fh:
         try:
             parser = ijson.parse(fh)
             for prefix, event, value in parser:
                 if (prefix, event) == ('', 'map_key'):
                     # `format_url` seems pretty unique to BIOM 1.0.
                     if value == 'format_url':
                         return True
                     elif value not in self.top_level_keys:
                         return False
         except (ijson.JSONError, UnicodeDecodeError):
             pass
         return False
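sniff above bails out at the first top-level key it cannot account for, so format detection stays cheap even on huge files. The same early-exit idea in standalone form (the key set here is an assumption standing in for self.top_level_keys):

import io

import ijson

def looks_like_biom_1_0(fh, top_level_keys=frozenset(['id', 'format', 'format_url'])):
    # Inspect only top-level map keys; return a verdict as soon as one
    # is possible instead of reading the whole stream.
    for prefix, event, value in ijson.parse(fh):
        if (prefix, event) == ('', 'map_key'):
            if value == 'format_url':  # effectively unique to BIOM 1.0
                return True
            if value not in top_level_keys:
                return False
    return False

print(looks_like_biom_1_0(io.StringIO(u'{"id": "t1", "format_url": "http://biom-format.org"}')))  # True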
    def _read_edges(self):
        json_stream = open(self._in_file)
        elem_id = None
        elem_type = None
        # desc_en = None
        # label_en = None
        datatype = None
        datavalue_type = None
        current_claim_key = None
        datavalue_num_id = None
        possible_edges = []

        elem_count = 1

        for prefix, event, value in ijson.parse(json_stream):
            if event == 'end_map':
                if prefix == 'item':
                    for tuple_4 in possible_edges:
                        # tuple_4 is (datatype, datavalue_type, claim_key,
                        # datavalue_num_id)
                        if self._is_valid_edge(elem_type, tuple_4[0],
                                               tuple_4[1]):
                            yield (elem_id, 'Q' + tuple_4[3])
                    elem_id = None
                    elem_type = None
                    current_claim_key = None
                    # label_en = None
                    datavalue_num_id = None
                    datavalue_type = None
                    elem_count += 1
                    possible_edges = []
                    if elem_count % 10 == 0:
                        print 'Processed ' + str(elem_count) + ' elements so far'
                elif prefix == "item.claims." + str(current_claim_key) + ".item":
                    possible_edges.append((datatype, datavalue_type, current_claim_key, str(datavalue_num_id)))
            elif event == 'string':
                if prefix == 'item.id':
                    elem_id = value
                elif prefix == 'item.type':
                    elem_type = value
                elif prefix == 'item.claims.' + str(current_claim_key) + '.item.mainsnak.datatype':
                    datatype = value
                elif prefix == 'item.claims.' + str(current_claim_key) + '.item.mainsnak.datavalue.value.entity-type':
                    datavalue_type = value
                # elif prefix == 'item.labels.en.value':
                #     label_en = value
            elif event == 'map_key' and prefix == 'item.claims':
                current_claim_key = value
            elif event == 'number' and prefix == 'item.claims.' + str(
                    current_claim_key) + '.item.mainsnak.datavalue.value.numeric-id':
                datavalue_num_id = value
Example #55
0
def _analyze_possibly_broken_json(line: str) -> Iterator[dict]:
    readers: Any = {}  # too complicated for proper types
    f = io.StringIO(line)
    data: Union[Dict, List]
    try:
        for key, type_, value in ijson.parse(f):
            listeners = registry.get(key, [])
            for reader in readers.values():
                _, stack, property_name = reader
                parent = stack[-1]
                if type_ == 'start_map':
                    data = {}
                    stack.append(data)
                    if isinstance(parent, dict):
                        parent[property_name] = data
                    else:
                        parent.append(data)
                    continue
                if type_ in ('end_map', 'end_array'):
                    stack.pop()
                    continue
                if type_ == 'map_key':
                    reader[2] = value
                    continue
                if type_ == 'start_array':
                    data = []
                    stack.append(data)
                    if isinstance(parent, dict):
                        parent[property_name] = data
                    else:
                        parent.append(data)
                    continue
                if isinstance(parent, dict):
                    parent[property_name] = value
                else:
                    parent.append(value)
            for func in listeners:
                if type_ == 'start_map':
                    # Start reading
                    initial_data: Dict = {}
                    readers[func] = [initial_data, [initial_data], None]
                elif type_ == 'end_map':
                    yield from func(readers.pop(func)[0])
    except (ijson.common.IncompleteJSONError, ijson.backends.python.UnexpectedSymbol):
        pass
    finally:
        f.close()
    for func, reader in readers.items():
        for result in func(reader[0]):
            yield result
    def yield_elements(self):
        json_stream = open(self._in_file)

        elem_id = None
        elem_type = None
        desc_en = None
        label_en = None
        properties = []
        current_claim_key = None

        elem_count = 1

        for prefix, event, value in ijson.parse(json_stream):
            if event == 'end_map':
                if prefix == 'item':
                    if elem_type == 'item':
                        yield WikidataEntity(entity_id=elem_id,
                                             label=label_en,
                                             description=desc_en,
                                             outcoming_properties_id=properties)
                    elif elem_type == 'property':
                        yield WikidataProperty(property_id=elem_id,
                                               label=label_en,
                                               description=desc_en,
                                               outcoming_properties_id=properties)
                    elem_id = None
                    elem_type = None
                    desc_en = None
                    current_claim_key = None
                    label_en = None
                    properties = []
                    elem_count += 1
                    if elem_count % 500 == 0:
                        print 'Processed ' + str(elem_count) + ' elements so far'
                if prefix == 'item.claims.' + str(current_claim_key) + '.item':
                    # print 'item.claims.' + str(current_claim_key) + '.item'
                    properties.append(current_claim_key)
            elif event == 'string':
                if prefix == 'item.id':
                    elem_id = value
                elif prefix == 'item.type':
                    elem_type = value
                elif prefix == 'item.descriptions.en.value':
                    desc_en = value
                elif prefix == 'item.labels.en.value':
                    label_en = value
            elif event == 'map_key' and prefix == 'item.claims':
                current_claim_key = value
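_read_edges and yield_elements share one trick: the claim key captured from a map_key event is spliced into the prefixes they match next, letting a single pass follow whichever property block the parser is currently inside. A condensed sketch of that dynamic-prefix pattern (the document mimics a Wikidata dump but is illustrative):

import io

import ijson

doc = u'{"item": {"id": "Q42", "claims": {"P31": [{"mainsnak": {"datatype": "wikibase-item"}}]}}}'

claim_key = None
for prefix, event, value in ijson.parse(io.StringIO(doc)):
    if event == 'map_key' and prefix == 'item.claims':
        claim_key = value  # remember which property we are inside
    elif prefix == 'item.claims.%s.item.mainsnak.datatype' % claim_key:
        print(claim_key, value)  # P31 wikibase-item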
def get_bucket_primary_dict(json_file_path):
    """Map each bucket name to the prefix of the peer whose value is the
    all-ones sentinel, which appears to mark the primary peer."""
    bucket_primary_dict = {}
    bucket_name = None
    primary_peer = None
    with open(json_file_path, 'r') as f:
        parser = ijson.parse(f)
        for prefix, event, value in parser:
            if prefix == "entry.item.name":
                bucket_name = value
            if value == "0xffffffffffffffff":
                primary_peer = prefix
            if bucket_name is not None and primary_peer is not None:
                bucket_primary_dict[bucket_name] = primary_peer
                bucket_name = None
                primary_peer = None
    return bucket_primary_dict
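get_bucket_primary_dict keys on a sentinel value rather than a fixed prefix: whichever prefix carries "0xffffffffffffffff" is paired with the most recent entry.item.name. A hedged sketch of the expected input (the layout is a guess, not a documented schema):

import io

import ijson

doc = u'''{"entry": [
  {"name": "bucket-a", "peers": {"peer-1": "0xffffffffffffffff", "peer-2": "0x0"}},
  {"name": "bucket-b", "peers": {"peer-3": "0xffffffffffffffff"}}
]}'''

bucket, primary, result = None, None, {}
for prefix, event, value in ijson.parse(io.StringIO(doc)):
    if prefix == 'entry.item.name':
        bucket = value
    if value == '0xffffffffffffffff':
        primary = prefix  # e.g. 'entry.item.peers.peer-1'
    if bucket is not None and primary is not None:
        result[bucket] = primary
        bucket = primary = None
print(result)  # {'bucket-a': 'entry.item.peers.peer-1', 'bucket-b': 'entry.item.peers.peer-3'}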
def main():
    # Establish communication queues
    tasks = multiprocessing.JoinableQueue(16)
    results = multiprocessing.Queue()

    # Start consumers
    num_consumers = multiprocessing.cpu_count() * 2
    print('Creating %d consumers' % num_consumers, file=sys.stderr)
    consumers = [ Consumer(tasks, results)
                  for i in xrange(num_consumers) ]
    for w in consumers:
        w.start()

    r = requests.post(NEO4J_CREATE_TRAN_URL, headers=generateheaders(),
                      data=json.dumps(GET_PDB_IDS_NEO4J_STATEMENT), stream=True)
    parser = ijson.parse(r.raw)
    i = 0

    buildingObject = False
    pdb_id_list = []
    builder = None
    counter = 0
    for prefix, event, value in parser:
        if not buildingObject and (prefix, event) == ('results.item.data.item', 'start_map'):
            buildingObject = True
            builder = ijson.ObjectBuilder()
            builder.event(event, value)
        elif buildingObject and (prefix, event) == ('results.item.data.item', 'end_map'):
            buildingObject = False
            builder.event(event, value)
            # hand the completed object off to the worker pool
            tasks.put(betaSheetOrder(builder.value["row"], counter), True, None)
            counter += 1
        elif buildingObject:
            builder.event(event, value)

    print("%d domains found" %counter, file=sys.stderr)
        # Add a poison pill for each consumer
    for i in xrange(num_consumers):
        tasks.put(None)

    # Wait for all of the tasks to finish
    tasks.join()
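The buildingObject/ObjectBuilder state machine in main reconstructs each results.item.data.item map from raw events. Current ijson releases expose the same behaviour directly as ijson.items(), which is usually simpler when the whole object is needed; a minimal equivalent sketch:

import io

import ijson

doc = u'{"results": [{"data": [{"row": [1, 2]}, {"row": [3]}]}]}'

# ijson.items() assembles full Python objects at the given prefix,
# replacing the manual ObjectBuilder bookkeeping above.
for counter, obj in enumerate(ijson.items(io.StringIO(doc), 'results.item.data.item')):
    print(counter, obj['row'])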
def get_json_via_ijson(url):
    """
    Create an ijson parser object from a JSON URL.

    Parameters
    ----------
    url - a URL ending in .json

    Returns
    -------
    An ijson parsing object
    """
    from ijson import parse

    # open the JSON stream (gcontext is an SSL context defined elsewhere)
    f = urllib2.urlopen(url, context=gcontext)
    # parse the JSON
    parser = parse(f)
    return parser
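get_json_via_ijson is Python 2 code: urllib2 is gone in Python 3, and gcontext is an SSL context defined elsewhere in its module. Under Python 3 the same helper might look like the following sketch (whether a custom context is needed at all is an assumption):

import urllib.request

import ijson

def get_json_via_ijson_py3(url, context=None):
    """Open a .json URL and return an ijson parse-event iterator."""
    # context is optional; supply an ssl.SSLContext here only if the
    # original gcontext customized certificate handling.
    f = urllib.request.urlopen(url, context=context)
    return ijson.parse(f)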
Example #60
0
 def __init__(self, in_stream):
     """ Creates a new CxReader for reading from "in_stream".
     :param in_stream: object
                       A file-like object to read from
     """
     if in_stream is None:
         raise AssertionError('input stream must not be None')
     self.__parser = ijson.parse(in_stream)
     self.__pre_meta_data = []
     self.__post_meta_data = []
     self.__aspect_element_counts = {}
     self.__aspect_names = set()
     self.__first_element = None
     self.__number_verification = None
     self.__status = None
     for e in self.aspect_elements():
         if e is not None:
             self.__first_element = e
             break
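The constructor above eagerly pulls one element through aspect_elements() so that malformed input fails at construction time rather than on first iteration. The event stream it wraps can also be examined directly; a small sketch over an in-memory, CX-like document (the aspect layout is illustrative):

import io

import ijson

doc = u'[{"nodes": [{"@id": 1}]}, {"edges": [{"@id": 2, "s": 1, "t": 1}]}]'

# A CX-style document is a JSON array of single-aspect maps; the aspect
# names are the top-level map keys of each array element.
aspect_names = set()
for prefix, event, value in ijson.parse(io.StringIO(doc)):
    if (prefix, event) == ('item', 'map_key'):
        aspect_names.add(value)
print(sorted(aspect_names))  # ['edges', 'nodes']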