def load_data(path):

    with open(path, 'r') as face_emotion_data:

        field_names = ['emotion', 'pixels', 'Usage']
        reader = DictReader(face_emotion_data, fieldnames=field_names)
        reader.next()  # skip the header row, since fieldnames are passed explicitly

        train_data = []
        train_labels = []
        test_data = []
        test_labels = []

        for row in reader:
            row['pixels'] = [
                float(pixel) / 255 for pixel in row['pixels'].split()
            ]
            image = np.array(row['pixels'], dtype='float32').reshape(
                (1, 48, 48))
            if row['Usage'] == 'Training':
                train_data.append(image)
                train_labels.append(int(row['emotion']))
            else:
                test_data.append(image)
                test_labels.append(int(row['emotion']))

    train_data = np.array(train_data)
    train_labels = np_utils.to_categorical(train_labels, 7)
    test_data = np.array(test_data)
    test_labels = np_utils.to_categorical(test_labels, 7)

    return train_data, test_data, train_labels, test_labels
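A minimal usage sketch for load_data (hypothetical file name; assumes the snippet's missing imports are numpy, csv.DictReader and Keras' np_utils):

import numpy as np
from csv import DictReader
from keras.utils import np_utils

train_data, test_data, train_labels, test_labels = load_data('fer2013.csv')
# train_data has shape (n_train, 1, 48, 48); labels are one-hot over 7 emotions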
Example #2
def upload_resources(filename, skip=0, limit=None):
    """Upload  from a CSV file."""
    # Use sys.stdout.write so resources can be printed nicely and succinctly
    import sys

    date_converter = lambda s: datetime.strptime(s, '%Y-%m-%d')
    bool_converter = lambda s: s == "true"
    resource_schema = facility_schema['fields']
    
    convert_map = {
        'integer': int,
        'float': float,
        'datetime': date_converter,
        'boolean': bool_converter
    }

    convert = {}

    for k, v in resource_schema.items():
        field_type = v.get('type')
        if field_type in convert_map:
            convert[k] = convert_map[field_type]

    def print_flush(msg):
        sys.stdout.write(msg)
        sys.stdout.flush()

    facility_code = facility_schema['facility_code']
    print_every = 1000
    print_flush("Adding resources. Please be patient.")

    with open(filename) as f:
        reader = DictReader(f)
        for i in range(skip):
            reader.next()
        for i, d in enumerate(reader):
            actual_index = i + skip + 2
            do_print = actual_index % print_every == 0
            try:
                d = dict((k, convert.get(k, str)(v)) for k, v in d.items() if v)
                coords = [d.pop('longitude', None), d.pop('latitude', None)]
                if coords[0] and coords[1]:
                    d['location'] = {'type': 'Point', 'coordinates': coords}
                d['facility_code'] = facility_code
                if not check(add_document(facility_schema['endpoint'], d), 201, False):
                    raise Exception()
                if do_print:
                    print_flush(".")

            except Exception as e:
                print "Error adding resource", e
                pprint(d)
                exit()

            if limit and i >= limit:
                break
    # Create a 2dsphere index on the location field for geospatial queries
    app.data.driver.db['resources'].create_index([('location', '2dsphere')])
    print "Resources uploaded!"
def upload_waterpoints(filename, skip=0, limit=None):
    """Upload waterpoints from a CSV file."""
    date_converter = lambda s: datetime.strptime(s, '%Y-%m-%d')
    bool_converter = lambda s: s == "true"

    status_map = {
        "non functional": "not functional",
        "functional needs repair": "needs repair"
    }

    status_converter = lambda s: status_map.get(s.lower(), s.lower())

    convert = {
        'gid': int,
        'object_id': int,
        'valid_from': date_converter,
        'valid_to': date_converter,
        'amount_tsh': float,
        'breakdown_year': int,
        'date_recorded': date_converter,
        'gps_height': float,
        'latitude': float,
        'longitude': float,
        'num_private': int,
        'region_code': int,
        'district_code': int,
        'population': int,
        'public_meeting': bool_converter,
        'construction_year': int,
        'status_group': status_converter
    }

    facility_code = "wpf001"

    with open(filename) as f:
        reader = DictReader(f)
        for i in range(skip):
            reader.next()
        for i, d in enumerate(reader):
            print "Adding line", i + skip + 2

            try:
                d = dict((k, convert.get(k, str)(v)) for k, v in d.items() if v)
                coords = [d.pop('longitude'), d.pop('latitude')]
                d['location'] = {'type': 'Point', 'coordinates': coords}
                d['facility_code'] = facility_code
                if not check(add_document('waterpoints', d)):
                    raise Exception()

            except Exception as e:
                print "Error adding waterpoint", e
                pprint(d)
                exit()

            if limit and i >= limit:
                break
    # Create a 2dsphere index on the location field for geospatial queries
    app.data.driver.db['facilities'].create_index([('location', '2dsphere')])
Example #4
  def __init__(self, csv):
    self.bag = Counter()
    reader = DictReader(open(csv, 'r'), fieldnames=[
      "TileFile", "Borders", "Quantity", "Features", "Notes"])
    reader.next()  # skip header, we've defined our own

    for tile_dict in reader:
      tile = Tile.from_csv(tile_dict)
      quantity = int(tile_dict["Quantity"].strip())
      self.bag[tile] = quantity
      if "B" in tile_dict["Features"]:
        self.first_tile = tile
Example #5
    def __init__(self, csv):
        self.bag = Counter()
        reader = DictReader(open(csv, 'r'),
                            fieldnames=[
                                "TileFile", "Borders", "Quantity", "Features",
                                "Notes"
                            ])
        reader.next()  # skip header, we've defined our own

        for tile_dict in reader:
            tile = Tile.from_csv(tile_dict)
            quantity = int(tile_dict["Quantity"].strip())
            self.bag[tile] = quantity
            if "B" in tile_dict["Features"]:
                self.first_tile = tile
    def test_collectl_file_iterator(self):
        '''test for CollectlFileIterator class'''

        comments = [
            '#COMMENT LINE 1\n', '# COMMENT 2\n',
            '############ COMMENT 3 ######\n'
        ]
        header = ['#Date,Time,Number\n']
        data = ['20130102,01:23:45.56,5\n', '20120807,15:30:00.001,7']

        mystring = StringIO.StringIO(''.join(comments) + ''.join(header) +
                                     ''.join(data))
        myfile = CollectlFileIterator(mystring)
        reader = DictReader(myfile)

        self.assertEqual(reader.fieldnames, ["Date", "Time", "Number"])

        entry1 = reader.next()
        entry2 = reader.next()

        self.assertEqual(entry1["Date"], "20130102")
        self.assertEqual(entry1["Time"], "01:23:45.56")
        self.assertEqual(entry1["Number"], "5")
        self.assertEqual(entry2["Date"], "20120807")
        self.assertEqual(entry2["Time"], "15:30:00.001")
        self.assertEqual(entry2["Number"], "7")
Example #7
    def test_subset_with_shapefile_no_ugid(self):
        """Test a subset operation using a shapefile without a UGID attribute."""

        output_format = [constants.OUTPUT_FORMAT_NUMPY, constants.OUTPUT_FORMAT_CSV_SHAPEFILE]

        geom = self.get_shapefile_path_with_no_ugid()
        geom_select_uid = [8, 11]
        geom_uid = 'ID'
        rd = self.test_data.get_rd('cancm4_tas')

        for of in output_format:
            ops = OcgOperations(dataset=rd, geom=geom, geom_select_uid=geom_select_uid, geom_uid=geom_uid, snippet=True,
                                output_format=of)
            self.assertEqual(len(ops.geom), 2)
            ret = ops.execute()
            if of == constants.OUTPUT_FORMAT_NUMPY:
                for element in geom_select_uid:
                    self.assertIn(element, ret)
                self.assertEqual(ret.properties[8].dtype.names, ('STATE_FIPS', 'ID', 'STATE_NAME', 'STATE_ABBR'))
            else:
                with open(ret) as f:
                    reader = DictReader(f)
                    row = reader.next()
                    self.assertIn(geom_uid, row.keys())
                    self.assertNotIn(env.DEFAULT_GEOM_UID, row.keys())

                shp_path = os.path.split(ret)[0]
                shp_path = os.path.join(shp_path, 'shp', '{0}_gid.shp'.format(ops.prefix))
                with fiona.open(shp_path) as source:
                    record = source.next()
                    self.assertIn(geom_uid, record['properties'])
                    self.assertNotIn(env.DEFAULT_GEOM_UID, record['properties'])
def extractThresholdValues(fname):
    # parse csv file and add threshold values as dict
    # this method might be called multiple times for one item

    # There are various formats:
    #   combined.modelEvaluation: Threshold Name, Testing.data, Cutoff,
    #                             Sensitivity, Specificity
    #   biomod2.modelEvaluation: Threshold Name, Testing.data, Cutoff.*,
    #                            Sensitivity.*, Specificity.*
    #   maxentResults.csv: Species,<various columns with interesting values>
    #                <threshold name><space><cumulative threshold,
    #                              logistic threshold,area,training omission>
    # FIXME: this is really ugly and csv format detection should be done
    #        differently
    thresholds = {}
    if fname.endswith("maxentResults.csv"):
        csvfile = open(fname, "r")
        dictreader = DictReader(csvfile)
        row = dictreader.next()
        # There is only one row in maxentResults
        namelist = (
            "Fixed cumulative value 1",
            "Fixed cumulative value 5",
            "Fixed cumulative value 10",
            "Minimum training presence",
            "10 percentile training presence",
            "10 percentile training presence",
            "Equal training sensitivity and specificity",
            "Maximum training sensitivity plus specificity",
            "Balance training omission, predicted area and threshold value",
            "Equate entropy of thresholded and original distributions",
        )
        for name in namelist:
            # We extract only 'cumulative threshold' values
            threshold = "{} cumulative threshold".format(name)
            thresholds[threshold] = Decimal(row[threshold])
    else:
        # assume it's one of our biomod/dismo results
        csvfile = open(fname, "r")
        dictreader = DictReader(csvfile)
        # search the field with Cutoff
        name = "Cutoff"
        for fieldname in dictreader.fieldnames:
            if fieldname.startswith("Cutoff."):
                name = fieldname
                break
        try:
            for row in dictreader:
                try:
                    thresholds[row[""]] = Decimal(row[name])
                except (TypeError, InvalidOperation) as e:
                    LOG.warn(
                        "Couldn't parse threshold value '%s' (%s) from "
                        "file '%s': %s", name, row[name], fname, repr(e)
                    )
        except KeyError:
            LOG.warn("Couldn't extract Threshold '%s' from file '%s'", name, fname)
    return thresholds
Example #9
class CSVUnicodeReader(object):
    def __init__(self, stream):
        self.reader = DictReader(UTF8Encoder(stream))

    def __iter__(self):
        return self

    def next(self):
        entry = self.reader.next()
        return dict([(unicode(k, "utf-8"), unicode(v, "utf-8")) for (k,v) in entry.items()])
def extractThresholdValues(fname):
    # parse csv file and add threshold values as dict
    # this method might be called multiple times for one item

    # There are various formats:
    #   combined.modelEvaluation: Threshold Name, Testing.data, Cutoff,
    #                             Sensitivity, Specificity
    #   biomod2.modelEvaluation: Threshold Name, Testing.data, Cutoff.*,
    #                            Sensitivity.*, Specificity.*
    #   maxentResults.csv: Species,<various columns with interesting values>
    #                <threshold name><space><cumulative threshold,
    #                              logistic threshold,area,training omission>
    # FIXME: this is really ugly and csv format detection should be done
    #        differently
    thresholds = {}
    if fname.endswith('maxentResults.csv'):
        csvfile = open(fname, 'r')
        dictreader = DictReader(csvfile)
        row = dictreader.next()
        # There is only one row in maxentResults
        namelist = (
            'Fixed cumulative value 1', 'Fixed cumulative value 5',
            'Fixed cumulative value 10', 'Minimum training presence',
            '10 percentile training presence',
            '10 percentile training presence',
            'Equal training sensitivity and specificity',
            'Maximum training sensitivity plus specificity',
            'Balance training omission, predicted area and threshold value',
            'Equate entropy of thresholded and original distributions')
        for name in namelist:
            # We extract only 'cumulative threshold' values
            threshold = '{} cumulative threshold'.format(name)
            thresholds[threshold] = Decimal(row[threshold])
    else:
        # assume it's one of our biomod/dismo results
        csvfile = open(fname, 'r')
        dictreader = DictReader(csvfile)
        # search the field with Cutoff
        name = 'Cutoff'
        for fieldname in dictreader.fieldnames:
            if fieldname.startswith('Cutoff.'):
                name = fieldname
                break
        try:
            for row in dictreader:
                try:
                    thresholds[row['']] = Decimal(row[name])
                except (TypeError, InvalidOperation) as e:
                    LOG.warn("Couldn't parse threshold value '%s' (%s) from"
                             "file '%s': %s",
                             name, row[name], fname, repr(e))
        except KeyError:
            LOG.warn("Couldn't extract Threshold '%s' from file '%s'",
                     name, fname)
    return thresholds
class UnicodeDictReader(object):
    '''A variant of the :class:`csv.DictReader` class that handles Unicode

:param file f: The CSV file to process.
:param list cols: The column-names of the CSV, as strings in a list.
:param string dialect: The CSV dialect. If ``None`` then the dialect will be guessed.
:param string encoding: The encoding of the file. If ``None`` the encoding will be guessed. If
                        guessing fails then UTF-8 will be assumed.'''
    def __init__(self, f, cols, dialect=None, encoding=None, **kwds):
        e = self.guess_encoding(f) if encoding is None else encoding
        d = self.guess_dialect(f) if dialect is None else dialect
        f = UTF8Recoder(f, e)
        self.reader = DictReader(f, cols, dialect=d, **kwds)

    @staticmethod
    def guess_encoding(f):
        detector = UniversalDetector()
        for line in f:
            detector.feed(line)
            if detector.done:
                break
        f.seek(0)  # The above read moves the file-cursor in the CSV file.
        detector.close()
        retval = detector.result['encoding'] if detector.result[
            'encoding'] else 'utf-8'
        return retval

    @staticmethod
    def guess_dialect(f):
        # Taken from the Python standard docs, with thanks to Piers Goodhew <*****@*****.**>
        # <https://docs.python.org/2/library/csv.html#csv.Sniffer>
        s = Sniffer()
        try:
            retval = s.sniff(f.read(1024), [
                ',',
                '\t',
            ])  # 1024 taken from the Python docs
        except CSVError:
            retval = 'excel'
        finally:
            f.seek(
                0)  # The above f.read moves the file-cursor in the CSV file.
        return retval

    def next(self):
        row = self.reader.next()
        retval = {
            to_unicode_or_bust(k): to_unicode_or_bust(v)
            for k, v in row.items()
        }
        return retval

    def __iter__(self):
        return self
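A usage sketch for UnicodeDictReader (hypothetical file and column names; UTF8Recoder, to_unicode_or_bust and the chardet helpers it relies on live elsewhere in the source module):

with open('people.csv', 'rb') as f:
    reader = UnicodeDictReader(f, cols=['name', 'city'])
    reader.next()  # cols is passed as fieldnames, so consume the header row explicitly
    for row in reader:
        print row['name'], row['city']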
def upload_waterpoints(filename, skip=0, limit=None):
    """Upload waterpoints from a gzipped CSV file."""
    convert = {
        'date_recorded': lambda s: datetime.strptime(s, '%m/%d/%Y'),
        'population': int,
        'construction_year': lambda s: datetime.strptime(s, '%Y'),
        'breakdown_year': lambda s: datetime.strptime(s, '%Y'),
        'amount_tsh': float,
        'gps_height': float,
        'latitude': float,
        'longitude': float,
    }
    with gzip.open(filename) as f:
        reader = DictReader(f)
        for i in range(skip):
            reader.next()
        for i, d in enumerate(reader):
            d = dict((k, convert.get(k, str)(v)) for k, v in d.items() if v)
            d['facility_code'] = 'wpf001'
            check(add_document('waterpoints', d))
            if limit and i >= limit:
                break
Example #14
 def next(self):
     row = DictReader.next(self)
     try:
         processed_row = dict(
             (key, convert(value, self.field_types[key], self.allow_json))
             for key, value in row.iteritems())
     except ValueError as e:
         self.errors.append((e, row))
         if not self.silent:
             raise e
     else:
         self.rows_imported += 1
         return processed_row
Example #15
 def next(self):
   row = DictReader.next(self)
   try:
     processed_row = dict(
         (key, convert(value, self.field_types[key]))
         for key, value in row.iteritems()
     )
   except ValueError as e:
     self.errors.append((e, row))
     if not self.silent:
       raise e
   else:
     self.rows_imported += 1
     return processed_row
Example #16
class StructuredReader(object):
    def __init__(self, filename, container = None, dialect = 'simplecsv'):
        self._container = None
        if isinstance(container, ObjectContainer):
            self._container = container
            self._reader = DictReader(filename, fieldnames = None, restkey = "restkey", restval = "restval", dialect = dialect)
        elif isinstance(container, TupleContainer) or isinstance(container, ListContainer):
            self._container = container
            self._reader = csv.reader(filename, dialect = dialect)
        else:
            raise Exception("Given container is not valid")

    def next(self):
        # skip the header row on the first call
        if self._reader.line_num == 0:
            self._reader.next()

        row = self._reader.next()
        return self._container.fetch(row)


    def __iter__(self):
        return self
Example #17
def number1():
    filename = '/home/apt9online/src/bslcks/jtest.csv'
    cong = DictReader(open(filename))

    while True:
        p = cong.next()
        print cong.line_num
        if p['Include on directory'] == 'Yes':
          if p['Family relation'] != 'Duplicate':
            try:
                Person.objects.get(bslc_individual=p['Indiv #'])
                print "%s %s already exists in the DB" % (p['First name'],p['Last name'])
            except:
                record_person(p)
def test_success(field, expected, log_path, response_for, form_data, sm_mock):
    key, value = field
    form_data[key] = value
    assert response_for(form_data, log=False) == expected
    assert sm_mock.call_count == 1

    params = sm_mock.call_args[0][1]['fields']
    assert set(params.keys()) == set(form_data.keys())
    for key, value in form_data.items():
        assert params[key] == value.decode('utf8')

    assert response_for(form_data, log=True) == expected
    assert sm_mock.call_count == 2

    assert response_for(form_data, log=True) == expected
    assert sm_mock.call_count == 3

    with open(log_path) as log_file:
        reader = DictReader(log_file)
        row = reader.next()
        # rows should not be equal because the time field
        # is added by the logging function.
        assert row != reader.next()
Example #20
 def next(self):
     # Note: cannot use super because DictReader is an old-style class
     row = DictReader.next(self)
     d = {}
     for key, val in row.items():
         if isinstance(key, str):
             key = key.strip()
         if isinstance(val, str):
             val = val.strip()
             # Try to split by '|' to get a list
             split = val.split("|")
             if len(split) > 1:
                 val = map(str.strip, split)
         d[key] = val
     return d
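For illustration, a hedged example of what the stripping and '|'-splitting in next() above does to a raw DictReader row:

# a raw row such as
#     {'tags ': ' red | green | blue ', 'id': ' 7 '}
# comes back from next() as
#     {'tags': ['red', 'green', 'blue'], 'id': '7'}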
Example #21
class BaseCSVHandler(object):
    defined_input_field_names = ['date','customer','money']
    defined_output_field_names = ['date','customer','money']
    result = []

    def __init__(self, fnm_in='input.csv', fnm_out='output.csv', restkey=None, restval=None,
                  dialect_in="excel", dialect_out="excel"):
        self.f_in = open(fnm_in)
        self.csv_dict_reader = DictReader(self.f_in, restkey=restkey, restval=restval, dialect=dialect_in)
        field_names = self.csv_dict_reader.fieldnames
        if len(field_names) != len(self.defined_input_field_names):
            raise ValueError(
                "incorrect number of columns in the file %s, it should have %d columns" %
                (fnm_in, len(self.defined_input_field_names)))
        if [1 for x in zip(field_names, self.defined_input_field_names) if x[0] != x[1]]:
            raise ValueError(
                "incorrect names of columns in the file %s, they should be %s" %
                (fnm_in, '"{0}"'.format('","'.join(x for x in self.defined_input_field_names))))
        self.f_out = open(fnm_out, 'w')
        self.csv_dict_writer = DictWriter(self.f_out, self.defined_output_field_names, dialect=dialect_out)

    def __iter__(self):
        return self

    def one_string_handler(self,s):
        if s: self.result.append (s)

    def next(self):
        return self.csv_dict_reader.next()

    def calc_result(self):
        pass

    def write_result(self):
        self.csv_dict_writer.writeheader()
        self.csv_dict_writer.writerows(self.result)

    def close_all_files(self):
        self.f_in.close()
        self.f_out.close()
        self.csv_dict_writer = None
        self.csv_dict_reader = None

    def process_all(self):
        for i in self: self.one_string_handler(i)
        self.calc_result()
        self.write_result()
        self.close_all_files()
Example #22
def loadcsv():
    fieldnames = ['left_s0_kp', 'left_s0_ki', 'left_s0_kd',
                  'left_s1_kp', 'left_s1_ki', 'left_s1_kd',
                  'left_e0_kp', 'left_e0_ki', 'left_e0_kd',
                  'left_e1_kp', 'left_e1_ki', 'left_e1_kd',
                  'left_w0_kp', 'left_w0_ki', 'left_w0_kd',
                  'left_w1_kp', 'left_w1_ki', 'left_w1_kd',
                  'left_w2_kp', 'left_w2_ki', 'left_w2_kd',
                  'error']
    with open("simAnnPIDprogress.csv", "rb") as f:
        reader = DictReader(f, fieldnames)
        dictionary = reader.next()
    params = dict2params(dictionary)
    error = dictionary['error']

    return params, error
Example #23
    def test_write_csv(self):
        """TestBase: Base::write_csv() creates a valid csv"""
        from csv import DictReader

        fname = "thermal.csv"
        trappy.Run().thermal.write_csv(fname)

        with open(fname) as fin:
            csv_reader = DictReader(fin)

            self.assertTrue("Time" in csv_reader.fieldnames)
            self.assertTrue("temp" in csv_reader.fieldnames)

            first_data = csv_reader.next()
            self.assertEquals(first_data["Time"], "0.0")
            self.assertEquals(first_data["temp"], "68786")
Example #24
    def test_write_csv(self):
        """TestBase: Base::write_csv() creates a valid csv"""
        from csv import DictReader

        fname = "thermal.csv"
        trappy.FTrace().thermal.write_csv(fname)

        with open(fname) as fin:
            csv_reader = DictReader(fin)

            self.assertTrue("Time" in csv_reader.fieldnames)
            self.assertTrue("temp" in csv_reader.fieldnames)

            first_data = csv_reader.next()
            self.assertEquals(first_data["Time"], "0.0")
            self.assertEquals(first_data["temp"], "68786")
class UnicodeDictReader(object):
    '''A variant of the :class:`csv.DictReader` class that handles Unicode

:param file f: The CSV file to process.
:param list cols: The column-names of the CSV, as strings in a list.
:param string dialect: The CSV dialect. If ``None`` then the dialect will be guessed.
:param string encoding: The encoding of the file. If ``None`` the encoding will be guessed. If
                        guessing fails then UTF-8 will be assumed.'''
    def __init__(self, f, cols, dialect=None, encoding=None, **kwds):
        e = self.guess_encoding(f) if encoding is None else encoding
        d = self.guess_dialect(f) if dialect is None else dialect
        f = UTF8Recoder(f, e)
        self.reader = DictReader(f, cols, dialect=d, **kwds)

    @staticmethod
    def guess_encoding(f):
        detector = UniversalDetector()
        for line in f:
            detector.feed(line)
            if detector.done:
                break
        f.seek(0)  # The above read moves the file-cursor in the CSV file.
        detector.close()
        retval = detector.result['encoding'] if detector.result['encoding'] else 'utf-8'
        return retval

    @staticmethod
    def guess_dialect(f):
        # Taken from the Python standard docs, with thanks to Piers Goodhew <*****@*****.**>
        # <https://docs.python.org/2/library/csv.html#csv.Sniffer>
        s = Sniffer()
        try:
            retval = s.sniff(f.read(1024), [',', '\t', ])  # 1024 taken from the Python docs
        except CSVError:
            retval = 'excel'
        finally:
            f.seek(0)  # The above f.read moves the file-cursor in the CSV file.
        return retval

    def next(self):
        row = self.reader.next()
        retval = {to_unicode_or_bust(k): to_unicode_or_bust(v) for k, v in row.items()}
        return retval

    def __iter__(self):
        return self
Example #26
def graceful_read_csv(filename):
	from csv import DictReader

	data = []
	try:
		f = open(filename, 'rb')
	except IOError as e:
		print( "ERROR:", e.strerror )
		exit()

	csvreader = DictReader(f)
	for row in csvreader:
		data.append(row)

	return data
class UnicodeDictReader(object):
    """
    A CSV reader which will iterate over lines in the CSV file "f",
    which is encoded in the given encoding.
    """

    def __init__(self, f, cols, dialect="excel", encoding="utf-8", **kwds):
        f = UTF8Recoder(f, encoding)
        self.reader = DictReader(f, cols, dialect=dialect, **kwds)

    def next(self):
        row = self.reader.next()
        retval = {to_unicode_or_bust(k): to_unicode_or_bust(v) for k, v in row.items()}
        return retval

    def __iter__(self):
        return self
Example #28
class BLASRm4Reader:
	"""
	BLASR -m 4 -header should generate header:
	qname tname score pctsimilarity
	qstrand qstart qend qseqlength
	tstrand tstart tend tseqlength
	mapqv ncells clusterScore probscore numSigClusters
	"""
	def __init__(self, filename, flipQS=False):
		self.filename = filename
		self.f = open(filename)
		self.reader = DictReader(self.f, delimiter=' ')
		self.flipQS = flipQS # is True, swap query <-> subject/target
		
	def __iter__(self):
		return self
	
	def next(self):
		d = self.reader.next()
		if len(d) == 0:
			raise StopIteration("EOF reached!")
		rec = BLAST9Record(None)
		# trim qID of the last /0_len which is added by BLASR
		#d['qname'] = d['qname'][:d['qname'].rfind('/')]
		if self.flipQS:
			rec.qID = d['tname']
			rec.qStart = int(d['tstart'])  # already 0-based
			rec.qEnd = int(d['tend'])
			rec.qLen = int(d['tseqlength'])
			rec.sID = d['qname']
			rec.sStart = int(d['qstart'])
			rec.sEnd = int(d['qend'])
			rec.sLen = int(d['qseqlength'])
		else:  # query is Q, target is S
			rec.qID = d['qname']
			rec.qStart = int(d['qstart'])  # already 0-based
			rec.qEnd = int(d['qend'])
			rec.qLen = int(d['qseqlength'])
			rec.sID = d['tname']
			rec.sStart = int(d['tstart'])
			rec.sEnd = int(d['tend'])
			rec.sLen = int(d['tseqlength'])
		rec.strand = '+' if d['qstrand'] == d['tstrand'] else '-'
		rec.identity = float(d['pctsimilarity'])
		return rec
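A usage sketch for BLASRm4Reader (hypothetical file name; BLAST9Record comes from the surrounding module):

for rec in BLASRm4Reader('alignments.m4', flipQS=False):
    print rec.qID, rec.sID, rec.strand, rec.identity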
def read_junction_report(filename):
    """
    tab-delimited with header:
           chr     left    right   strand  num_transcript  num_sample      genome  annotation      label

    return: dict of label --> records with that label
    """
    reader = DictReader(open(filename), delimiter='\t')
    r = reader.next()

    cur_label, cur = r['label'], [r]
    for r in reader:
        if r['label'] != cur_label:
            yield cur_label, cur
            cur_label, cur = r['label'], [r]
        else:
            cur.append(r)
    yield cur_label, cur
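A usage sketch for the generator above (hypothetical file name); it yields one (label, records) group at a time:

for label, records in read_junction_report('junction_report.txt'):
    print label, len(records)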
Example #30
class UnicodeReader:
    """ a csv reader which will iterate over lines in the csv file 'f',
        which is encoded in the given encoding;  stolen and adapted from
        from http://docs.python.org/lib/csv-examples.html """

    def __init__(self, f, dialect=excel, encoding='utf-8', **kwargs):
        f = UTF8Recoder(f, encoding)
        self.reader = DictReader(f, dialect=dialect, **kwargs)

    def next(self):
        data = self.reader.next()
        for key, value in data.items():
            if isinstance(value, basestring):
                data[key] = unicode(value, 'utf-8')
        return data

    def __iter__(self):
        return self
def test_append_field_err(form_config, form_data, log_path):
    """ Checks that error logs are correctly written and appended

    Submits three forms, the second two have different fields to the first
    and should be added to the same log file as each other, and be identical
    """
    formmail.log_formdata(form_data, log_path)
    del form_data['email']

    # submit two forms with fields that don't match the config
    # this should append the second form to the error log file
    with pytest.raises(Exception):
        formmail.log_formdata(form_data, log_path)
    with pytest.raises(Exception):
        formmail.log_formdata(form_data, log_path)

    with open(log_path + '_error') as error_log:
        reader = DictReader(error_log)
        assert reader.next() == form_data
        assert reader.next() == form_data
Example #32
def test_append_field_err(form_config, form_data, log_path):
    """ Checks that error logs are correctly written and appended

    Submits three forms, the second two have different fields to the first
    and should be added to the same log file as each other, and be identical
    """
    formmail2.log_formdata(form_data, log_path)
    del form_data['email']

    # submit two forms with fields that don't match the config
    # this should append the second form to the error log file
    with pytest.raises(Exception):
        formmail2.log_formdata(form_data, log_path)
    with pytest.raises(Exception):
        formmail2.log_formdata(form_data, log_path)

    with open(log_path + '_error') as error_log:
        reader = DictReader(error_log)
        assert reader.next() == form_data
        assert reader.next() == form_data
Example #33
    def clean_csv(self):
        """ ueberprueft, ob eine gueltige CSV-Datei hochgeladen wurde """

        # first test: is the Content-Type valid?
        csv = self.cleaned_data['csv']
        if csv.content_type != 'text/csv':
            self._errors['csv'] = self.error_class(['Nur CSV-Dateien sind als Eingabe erlaubt!'])
            return csv
    
        # second test: does the file have the right number of columns?
        reader = DictReader(csv)
        try:
            entry = reader.next()
            if len(entry) != 12:
                msg = 'Ungültiges Format der CSV-Datei (falsche Anzahl Spalten)!'
                self._errors['csv'] = self.error_class([msg])
        except StopIteration:
            msg = 'Ungültiges Format der CSV-Datei (keine Bestellungen vorhanden)!'
            self._errors['csv'] = self.error_class([msg])
        orders = [entry] + [row for row in reader]
        return orders
Example #34
 def next(self):
     ret = self.csvAbleClass()
     for (csvField, value) in DictReader.next(self).iteritems():
         if isinstance(value, str):
             value = value.decode(self.encoding)
             try:
                 value = value.encode('ascii')
             except UnicodeError:
                 pass
         field = self.fieldsDict.get(csvField) if self.fieldsDict and csvField != self.csvAbleClass.READ_REST_KEY else csvField
         if field:
             setattr(ret, field, value)
         elif value is not None:
             rest = getattr(ret, self.csvAbleClass.READ_REST_KEY, None)
             if rest is None:
                 rest = value if field == self.csvAbleClass.READ_REST_KEY else {csvField: value}
                 setattr(ret, self.csvAbleClass.READ_REST_KEY, rest)
             elif field == self.csvAbleClass.READ_REST_KEY:
                 rest.update(value)
             else:
                 rest[csvField] = value # pylint: disable=E1137
     return ret
Example #35
def dictcsv(csvname, fieldnames = None, arrays = False):
    """Reading csv files into a dictionary.

    Arguments:
    csvname: string filename
    
    Keyword Arguments:
    fieldnames: list of csv column names.  If None, the first row of the file
                being read will be used.
    arrays: Whether or not to return the csv contents as a dict of numpy arrays

    Returns:
    dictionary of columns (lists, or numpy arrays if arrays=True), keyed by fieldname
    """

    fileobj = open(csvname, 'rU')
    DR = DictReader(fileobj, fieldnames = fieldnames)
    
    fields = DR.fieldnames
    l = DR.next()
    dicty = {}
    for f in fields:
        try:
            dicty[f] = [float(l[f])]
        except (TypeError, ValueError):
            dicty[f] = [l[f]]
    for row in DR:
        for f in fields:
            try:
                dicty[f].append(float(row[f]))
            except (TypeError, ValueError):
                dicty[f].append(row[f])
    if arrays:
        for key in dicty:
            dicty[key] = np.array(dicty[key])
            
    return dicty
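A usage sketch for dictcsv (hypothetical file name):

columns = dictcsv('measurements.csv', arrays=True)
# one numpy array per CSV column, keyed by the header names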
Example #36
def boundary_values(csvFile, header):
    """
    csvFile - path of the csv file to be read
    header - one of the numerical headers in the original csv file, assume valid

    returns [min value, max value] in the csv file given the header
    """

    f = open(csvFile)
    reader = DictReader(f)

    firstLine = reader.next()
    minValue = maxValue = value_of(firstLine, header)

    for row in reader:
        value = value_of(row, header)
        if value < minValue:
            # new minimum
            minValue = value
        elif value > maxValue:
            # new maximum
            maxValue = value

    return [minValue, maxValue]
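A usage sketch for boundary_values (hypothetical file name and header; value_of is the helper the snippet relies on):

low, high = boundary_values('prices.csv', 'price')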
class Cincinnati311CSVDataParser(object):
    """ Class that parses and cleans a Cincinnati 311 Comma Seperated Value
    (CSV) file record

    Data set description:
    --------------------
    https://data.cincinnati-oh.gov/Thriving-Healthy-Neighborhoods/
        Cincinnati-311-Non-Emergency-Service-Requests/4cjh-bm8b/about"""
    def __init__(self, h_file):
        """ Cincinnati311CSVDataParser class constructor

        Args:
            self: Cincinnati311CSVDataParser class object handle

            h_file: Cincinnati 311 csv file handle

        Returns:
            None"""
        fieldnames = [
            'jurisdictionid', 'servicerequestid', 'status', 'statusnotes',
            'servicename', 'servicecode', 'description', 'agencyresponsible',
            'servicenotice', 'requesteddatetime', 'updateddatetime',
            'expecteddatetime', 'address', 'addressid', 'zipcode', 'latitude',
            'longitude', 'requesteddate', 'updateddate', 'lasttableupdate'
        ]

        matchobj = re.compile('.*date.*')

        self.date_fields = filter(lambda elem: matchobj.match(elem) != None,
                                  fieldnames)

        self.string_fields = filter(lambda elem: matchobj.match(elem) == None,
                                    fieldnames)

        # http://stackoverflow.com/questions/265960/
        #   best-way-to-strip-punctuation-from-a-string-in-python
        self.punctuation_table = string.maketrans("", "")

        self.readerobj = DictReader(h_file, fieldnames)

    def __iter__(self):
        """ Iterator
        :return: None
        """
        return self

    def next(self):
        """ Parses a Cincinnati 311 CSV file record

        http://stackoverflow.com/questions/19151/how-to-make-class-iterable

        Args:
            self: Cincinnati311CSVDataParser class object handle

        Returns:
            record: Dictionary that stores a Cincinnati 311 data CSV file
                    record"""
        record = self.readerobj.next()

        if record['jurisdictionid'] == 'JURISDICTION_ID':
            record = None
        else:
            for key in self.date_fields:
                if len(record[key]) > 0:
                    record[key] = parser.parse(record[key])

            for key in self.string_fields:
                record[key] = re.sub("\"", "", record[key].lower())

            try:
                record['zipcode'] = int(record['zipcode'])
            except ValueError:
                record['zipcode'] = None

            for key in ['latitude', 'longitude']:
                try:
                    record[key] = float(record[key])
                except ValueError:
                    record[key] = None

            if len(record['servicecode']) > 0:
                record['servicecode'] =\
                    re.sub("\s+", "", record['servicecode'])

                record['servicecode'] =\
                    record['servicecode'].translate(self.punctuation_table,
                                                    string.punctuation)

                record['servicename'] =\
                    record['servicename'].translate(self.punctuation_table,
                                                    string.punctuation)

        return record
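A usage sketch for the parser above (hypothetical file name). Because the field names are supplied explicitly, the header row comes back as None, so callers filter falsy records:

with open('cincinnati_311.csv', 'r') as h_file:
    parser_obj = Cincinnati311CSVDataParser(h_file)
    records = [record for record in parser_obj if record is not None]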
Example #38
class KnownGeneFile(SmartFileIter):
    '''An iterable that parses UCSC's KnownGene gene annotation files.  Field 
    names are::

        FIELD_NAMES = [ 'name',
                        'chrom',
                        'strand',
                        'txStart',
                        'txEnd',
                        'cdsStart',
                        'cdsEnd',
                        'exonCount',
                        'exonStarts',
                        'exonEnds',
                        'proteinID',
                        'alignID',
                      ]
'''

    FIELD_NAMES = [
        'name',
        'chrom',
        'strand',
        'txStart',
        'txEnd',
        'cdsStart',
        'cdsEnd',
        'exonCount',
        'exonStarts',
        'exonEnds',
        'proteinID',
        'alignID',
    ]

    # function pointers for correct formatting of field names
    FIELD_TYPES = [
        str,
        str,
        str,
        int,
        int,
        int,
        int,
        lambda x: [int(y) for y in x.split(',') if len(y) > 0],
        lambda x: [int(y) for y in x.split(',') if len(y) > 0],
        lambda x: [int(y) for y in x.split(',') if len(y) > 0],
        str,
        str,
    ]

    def __init__(self, kg_fn):
        self.meta_data = []
        self.file_info = {}
        f = open(kg_fn, 'rU')
        self._dict_reader = DictReader(filter(lambda row: row[0] != '#', f),
                                       delimiter='\t',
                                       fieldnames=KnownGeneFile.FIELD_NAMES)

    def __iter__(self):
        return self

    def next(self):
        line = self._dict_reader.next()
        for k, f in zip(self.FIELD_NAMES, self.FIELD_TYPES):
            line[k] = f(line[k])
        return line
Example #39
class SmartFileIter:
    r"""An 'abstract' class implementing a smart file iterator.  It is essentially
    a wrapper around a csv.DictReader object that parses fields into
    Python datatypes (int, float, tuple, objects, etc) as they are iterated.
    The constructor argument *f* can be either a valid filename or a file-like
    object.  This class should not be directly instantiated - rather it should
    be subclassed with FIELD_NAMES and FIELD_TYPES defined.  FIELD_NAMES is a
    list of strings referring to the names of the fields, FIELD_TYPES is a list
    of the same length of callables that will parse the column into the desired
    format. Example::
    
      >>> s = StringIO('chr1\t0\t100\t+\nchr3\t300\t601\t-\n')
      >>> class IntervalFile(SmartFileIter):
              r'''A SmartFileIter for files with lines formatted like:
                    chrom\tstart\tend\tstrand'''
              FIELD_NAMES = ['chrom','start','end','strand']
              FIELD_TYPES= [str,int,int,lambda x: 0 if x == '+' else 1]
      >>> f = IntervalFile(s)
      >>> for r in f :
              print r['chrom'], 'length: ', r['end']-r['start'], 'strand: ',r['strand']

    ``r['start']`` and ``r['end']`` are automatically available as integers,
    so the subraction works as expected.  Arbitrary functions that accept a
    single argument and return a value may also be specified.
    """
    def __init__(self, f, skip_line_chars='#'):
        if not hasattr(self, 'FIELD_NAMES') or not hasattr(
                self, 'FIELD_TYPES'):
            raise Exception(
                'Subclasses must define class members FIELD_NAMES and FIELD_TYPES'
            )
        if isinstance(f, str):
            f = open(f, 'rU')
        self._dict_reader = DictReader(filter(lambda row: row[0] != '#', f),
                                       delimiter='\t',
                                       fieldnames=self.FIELD_NAMES)
        self.fieldnames = self.FIELD_NAMES
        self.curr_line = self._dict_reader.next()
        self.skip_line_chars = skip_line_chars

        # skip initial comment lines
        while self.curr_line[self.FIELD_NAMES[0]][0] in self.skip_line_chars:
            self.curr_line = self._dict_reader.next()

        if self.FIELD_NAMES[0] in self.curr_line.values():
            self.curr_line = self._dict_reader.next()

    def __iter__(self):
        return self

    def __getattr__(self, attr):
        try:
            return self.__dict__[attr]
        except KeyError:
            return getattr(self._dict_reader, attr)

    def next(self):
        """Emit the next record in the file as a dictionary with parsed values"""

        if self.curr_line is None:
            raise StopIteration()

        line = self.curr_line

        # check for comment
        while line[self.FIELD_NAMES[0]][0] in self.skip_line_chars:
            line = self.curr_line = self._dict_reader.next()

        for k, f in zip(self.FIELD_NAMES, self.FIELD_TYPES):
            try:
                line[k] = f(line[k])
            except Exception as e:
                #sys.stderr.write('Warning: field %s on line %d could not be properly formatted, exception %s\n'%(k,self._dict_reader.reader.line_num,str(e)))
                line[k] = line[k]

        try:
            self.curr_line = self._dict_reader.next()
        except StopIteration:
            self.curr_line = None

        return line
#!/usr/bin/env fab
from csv import DictReader
from codecs import open as uopen
from json import loads, dumps
from string import ascii_letters, punctuation
from fabric.api import *
from cStringIO import StringIO
from os import urandom
keyFd = DictReader(
    uopen("/Users/ss/keys/aliyun_key.csv", 'r', encoding='utf-8-sig'))
d = keyFd.next()

env.user = '******'
env.region = 'ap-southeast-1'
env.key_filename = ['/Users/ss/keys/ralali_production_key.pem']
env.access_key = d['AccessKeyId']
env.access_secret = d['AccessKeySecret']
env.key_pair = 'default_aliyun'
env.instance = 'ecs.n1.small'
env.zone = 'a'
env.imageid = 'ubuntu_16_0402_64_20G_alibase_20171227.vhd'
env.wp_tarball = 'http://wordpress.org/latest.tar.gz'
env.domain = 'test-aliyun.wordpress'
env.dbname = 'test_aliyun_db'


@task
def provision_ecs():
    instance_details = local("aliyuncli ecs CreateInstance --AccessKeyId %s --AccessKeySecret %s --KeyPairName %s --RegionId %s --InstanceType %s --ImageId %s" % \
      (env.access_key, env.access_secret, env.key_pair, env.region, env.instance, env.imageid))
    env.ecs_instance = loads(instance_details)['InstanceId']
class Cincinnati311CSVDataParser(object):
    """ Class that parses and cleans a Cincinnati 311 Comma Seperated Value
    (CSV) file record

    Data set description:
    --------------------
    https://data.cincinnati-oh.gov/Thriving-Healthy-Neighborhoods/
        Cincinnati-311-Non-Emergency-Service-Requests/4cjh-bm8b/about"""

    def __init__(self,
                 h_file):
        """ Cincinnati311CSVDataParser class constructor

        Args:
            self: Cincinnati311CSVDataParser class object handle

            h_file: Cincinnati 311 csv file handle

        Returns:
            None"""
        fieldnames = ['jurisdictionid',
                      'servicerequestid',
                      'status',
                      'statusnotes',
                      'servicename',
                      'servicecode',
                      'description',
                      'agencyresponsible',
                      'servicenotice',
                      'requesteddatetime',
                      'updateddatetime',
                      'expecteddatetime',
                      'address',
                      'addressid',
                      'zipcode',
                      'latitude',
                      'longitude',
                      'requesteddate',
                      'updateddate',
                      'lasttableupdate']

        matchobj = re.compile('.*date.*')

        self.date_fields = filter(lambda elem: matchobj.match(elem) != None,
                                  fieldnames)

        self.string_fields = filter(lambda elem: matchobj.match(elem) == None,
                                    fieldnames)

        # http://stackoverflow.com/questions/265960/
        #   best-way-to-strip-punctuation-from-a-string-in-python
        self.punctuation_table = string.maketrans("", "")

        self.readerobj = DictReader(h_file, fieldnames)

    def __iter__(self):
        """ Iterator
        :return: None
        """
        return self

    def next(self):
        """ Parses a Cincinnati 311 CSV file record

        http://stackoverflow.com/questions/19151/how-to-make-class-iterable

        Args:
            self: Cincinnati311CSVDataParser class object handle

        Returns:
            record: Dictionary that stores a Cincinnati 311 data CSV file
                    record"""
        record = self.readerobj.next()

        if record['jurisdictionid'] == 'JURISDICTION_ID':
            record = None
        else:
            for key in self.date_fields:
                if len(record[key]) > 0:
                    record[key] = parser.parse(record[key])

            for key in self.string_fields:
                record[key] = re.sub("\"", "", record[key].lower())

            try:
                record['zipcode'] = int(record['zipcode'])
            except ValueError:
                record['zipcode'] = None

            for key in ['latitude', 'longitude']:
                try:
                    record[key] = float(record[key])
                except ValueError:
                    record[key] = None

            if len(record['servicecode']) > 0:
                record['servicecode'] =\
                    re.sub("\s+", "", record['servicecode'])

                record['servicecode'] =\
                    record['servicecode'].translate(self.punctuation_table,
                                                    string.punctuation)

                record['servicename'] =\
                    record['servicename'].translate(self.punctuation_table,
                                                    string.punctuation)

        return record
Example #42
    def _import_from_csv(cls, path):
        """Populate the DB from CSV data.

        The 'path' attribute is the absolute path to the CSV file that must be
        inserted on the database.
        """
        atualizadas = []
        novas = []
        errors = []

        fieldnames = [
            'remove1', 'nome_unidade', 'sigla_unidade', 'tipo_logradouro',
            'nome_logradouro', 'numero', 'complemento', 'bairro', 'municipio',
            'uf', 'cep', 'ddd', 'telefone', 'email', 'remove2'
        ]
        with open(path, 'r') as csv_file:
            data = DictReader(csv_file, fieldnames=fieldnames)

            for row in data:
                if row['nome_unidade'] == 'nome_unidade':
                    row = data.next()

                del row['remove1']
                del row['remove2']

                try:
                    unidade = UnidadePrisional.objects.get(
                        nome_unidade=row['nome_unidade'],
                        municipio=Cidade.objects.get(nome=row['municipio'],
                                                     estado=row['uf']))
                    unidade._update_from_dict(row)
                    unidade.save()
                    atualizadas.append(unidade.nome_unidade)
                except ObjectDoesNotExist:
                    try:
                        unidade = UnidadePrisional._new_from_dict(row)
                        unidade.save()
                        novas.append(unidade.nome_unidade)
                    except Exception as e:
                        error = {
                            'nome_unidade': row['nome_unidade'],
                            'erro': str(e),
                            'data': row
                        }
                        errors.append(error)

        msg = 'Resumo da operação:\n'
        if atualizadas:
            msg += '    - '
            msg += '{} unidades foram atualizadas.\n'.format(len(atualizadas))
            log.info('    {}'.format(atualizadas))

        if novas:
            msg += '    - '
            msg += '{} unidades foram adicionadas.\n'.format(len(novas))

        if errors:
            msg += 'Ocorreram {} erros de importação:\n'.format(len(errors))
            for error in errors:
                msg += '    - '
                msg += 'Unidade: {:.30}'.format(error['nome_unidade'])
                msg += ' | {} | {}/{}\n'.format(error['erro'],
                                                error['data']['uf'],
                                                error['data']['municipio'])

        log.info(msg)
Example #43
    def meta_schema(self):
        from csv import DictReader
        from collections import defaultdict
        import yaml

        self.database.create()

        config = dict(self.metadata.build.config)
        url = self.metadata.build.sources['table_map'].format(**config)

        tn = self.filesystem.download(url)

        current_table = None
        t = None

        # These tables spread across more than one segment,
        # which is a difficult special case, so these tables
        # are re-named to have the segment number as a suffix.
        large_tables = [
            'B24121', 'B24122', 'B24123', 'B24124', 'B24125', 'B24126'
        ]

        table_segments = defaultdict(list)

        lr = self.init_log_rate(1000)

        with self.session, open(tn) as f:
            reader = DictReader(f)

            for i, row in enumerate(reader):

                if self.run_args.test and i > 500:
                    break

                if row['Table ID'] in large_tables:
                    row['Table ID'] = (row['Table ID'] + '_' +
                                       str(int(row['Sequence Number'])))

                #### These are grouping lines that have no data
                #### associated with them.
                if row['Line Number'].endswith('.5'):
                    continue

                col_data = {'segment': int(row['Sequence Number'])}

                if row['Table ID'] != current_table:
                    #
                    # A New Table
                    #
                    new_table = True
                    current_table = row['Table ID']

                    # The row after the table is the universe
                    universe = reader.next()['Table Title']
                    if not universe.startswith('Universe:'):
                        raise Exception("Universe fail")
                    else:
                        parts = universe.split(':')
                        universe = parts[1].strip()

                    t = self.schema.add_table(
                        current_table,
                        description=row['Table Title'].title(),
                        universe=universe,
                        keywords=row['Subject Area'],
                        data={
                            'segment': int(row['Sequence Number']),
                            'start': int(row['Start Position'])
                        })

                    if current_table not in table_segments[
                            int(row['Sequence Number'])]:
                        (table_segments[int(
                            row['Sequence Number'])].append(current_table))

                    ac = self.schema.add_column

                    is1 = 'i1'

                    # Flag to mark which columns should be removed from the table when constructing
                    # a segment header.
                    link_data = dict(col_data.items())
                    link_data['is_link'] = 1

                    ac(t,
                       'id',
                       datatype='integer',
                       is_primary_key=True,
                       description=row['Table Title'].title())
                    ac(t,
                       'FILEID',
                       datatype='varchar',
                       size=6,
                       data=link_data,
                       description='Universe: {}'.format(universe))
                    ac(t,
                       'FILETYPE',
                       datatype='varchar',
                       size=6,
                       data=link_data)
                    ac(t,
                       'STUSAB',
                       datatype='varchar',
                       size=2,
                       data=link_data,
                       indexes=is1)
                    ac(t,
                       'CHARITER',
                       datatype='varchar',
                       size=3,
                       data=link_data)
                    ac(t,
                       'SEQUENCE',
                       datatype='varchar',
                       size=4,
                       data=link_data)
                    ac(t,
                       'LOGRECNO',
                       datatype='integer',
                       size=7,
                       data=link_data,
                       indexes=is1)

                else:
                    #
                    # A row for an existing table.
                    #

                    try:
                        int(row['Line Number'])
                    except ValueError:
                        print "Failed for ", row
                        continue

                    name = "{}{:03d}".format(current_table,
                                             int(row['Line Number']))

                    self.schema.add_column(
                        t,
                        name,
                        datatype='integer',
                        description=(row['Table Title'].decode('latin1')),
                        data=col_data)

                lr("Creating schema: {}".format(t.name))
                last_table = row['Table ID']
                new_table = False

        with open(self.filesystem.path('meta', 'tables.yaml'), 'w') as f:
            f.write(
                yaml.dump(dict(table_segments),
                          indent=4,
                          default_flow_style=False))

        with open(self.filesystem.path('meta', self.SCHEMA_FILE), 'w') as f:
            self.schema.as_csv(f)

        return True
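
The segment bookkeeping in the loader above is easy to misread, so here is a minimal standalone sketch of the same idea. The rows and table IDs below are hypothetical stand-ins for the ACS sequence/table-shell CSV, not real data.

from collections import defaultdict

LARGE_TABLES = ['B24121', 'B24122', 'B24123', 'B24124', 'B24125', 'B24126']

rows = [
    {'Table ID': 'B01001', 'Sequence Number': '2'},
    {'Table ID': 'B24121', 'Sequence Number': '105'},
    {'Table ID': 'B24121', 'Sequence Number': '106'},
]

table_segments = defaultdict(list)
for row in rows:
    table_id = row['Table ID']
    if table_id in LARGE_TABLES:
        # Multi-segment tables get the segment number as a suffix so that
        # each segment becomes its own table in the schema.
        table_id = '{}_{}'.format(table_id, int(row['Sequence Number']))
    segment = int(row['Sequence Number'])
    if table_id not in table_segments[segment]:
        table_segments[segment].append(table_id)

print(dict(table_segments))
# e.g. {2: ['B01001'], 105: ['B24121_105'], 106: ['B24121_106']}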
Beispiel #44
0
    print "-    Parallel downloads: %s" % (threads)
    print "-----------------------------------------------------------------------"
    for i in range(2):
        print " "

    if not path.exists(destfolder):
        makedirs(destfolder)
    done = glob(path.join(destfolder, '*.jpg'))

    filename = 'photo_links'
    if path.exists(filename):
        op = open(filename, 'r')
        reader = DictReader(op)
        urls = [row['url_original'] for row in reader]
        op.seek(0)
        reader.next()
        names = [row['name'] for row in reader]
        op.close()

        print '-      Checking which pictures are not already downloaded.        -'
        print '-                                                                 -'
        print '-           It may take some time, or not, who knows.             -'
        tengui = []
        for el, name in enumerate(names):
            if name in [d.split('/')[-1].split('.')[0] for d in done]:
                tengui.append(el)

        names = [i for j, i in enumerate(names) if j not in tengui]
        urls = [i for j, i in enumerate(urls) if j not in tengui]
    else:
        print 'Need picture links. Go find photo_links on my github  ;) https://github.com/dieguico/Project_Apollo_Archive'
Beispiel #45
0
"""
Import data from a CSV file into MongoDB.
Author: Fabio Pani <fabiux AT fabiopani DOT it>
License: GNU/GPL version 3 (see file LICENSE)
"""
from sys import argv
from csv import DictReader, QUOTE_NONNUMERIC
from lib.utils import csv_fieldnames, convert_row_for_mongo
from pymongo import MongoClient

if __name__ != '__main__':
    exit()

if len(argv) < 2:
    print('Usage: python import_from_csv.py <csv_file>')
    exit()

eq = MongoClient().ingv.earthquakes

with open(argv[1], 'rb') as f:
    reader = DictReader(f,
                        fieldnames=csv_fieldnames,
                        quotechar='"',
                        quoting=QUOTE_NONNUMERIC)
    reader.next()  # skip header
    for event in reader:
        try:
            eq.insert_one(convert_row_for_mongo(event))
        except Exception:
            pass  # skip rows that fail conversion or insertion
f.close()
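
The helpers imported from lib.utils (csv_fieldnames, convert_row_for_mongo) are not shown in this snippet, so here is a rough sketch of the same CSV-to-MongoDB pattern with a hypothetical file, collection and cleanup rule standing in for them.

from csv import DictReader, QUOTE_NONNUMERIC
from pymongo import MongoClient

def to_mongo(row):
    # Hypothetical cleanup: QUOTE_NONNUMERIC already turns unquoted fields
    # into floats, so we only drop empty values before inserting.
    return dict((k, v) for k, v in row.items() if v not in ('', None))

coll = MongoClient().testdb.events      # hypothetical database/collection
with open('events.csv') as f:           # hypothetical input file with a header row
    reader = DictReader(f, quotechar='"', quoting=QUOTE_NONNUMERIC)
    for row in reader:
        coll.insert_one(to_mongo(row))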
Beispiel #46
0
class SmartFileIter :
    r"""An 'abstract' class implementing a smart file iterator.  It is essentially
    a wrapper around a csv.DictReader object that parses fields into
    Python datatypes (int, float, tuple, objects, etc) as they are iterated.
    The constructor argument *f* can be either a valid filename or a file-like
    object.  This class should not be directly instantiated - rather it should
    be subclassed with FIELD_NAMES and FIELD_TYPES defined.  FIELD_NAMES is a
    list of strings referring to the names of the fields, FIELD_TYPES is a list
    of the same length of callables that will parse the column into the desired
    format. Example::
    
      >>> s = StringIO('chr1\t0\t100\t+\nchr3\t300\t601\t-\n')
      >>> class IntervalFile(SmartFileIter):
              r'''A SmartFileIter for files with lines formatted like:
                    chrom\tstart\tend\tstrand'''
              FIELD_NAMES = ['chrom','start','end','strand']
              FIELD_TYPES= [str,int,int,lambda x: 0 if x == '+' else 1]
      >>> f = IntervalFile(s)
      >>> for r in f :
              print r['chrom'], 'length: ', r['end']-r['start'], 'strand: ',r['strand']

    ``r['start']`` and ``r['end']`` are automatically available as integers,
    so the subtraction works as expected.  Arbitrary functions that accept a
    single argument and return a value may also be specified.
    """

    def __init__(self,f,skip_line_chars='#') :
        if not hasattr(self,'FIELD_NAMES') or not hasattr(self,'FIELD_TYPES') :
            raise Exception('Subclasses must define class members FIELD_NAMES and FIELD_TYPES')
        if isinstance(f,str) :
            f = open(f)
        self._dict_reader = DictReader(f,delimiter='\t',fieldnames=self.FIELD_NAMES)
        self.fieldnames = self.FIELD_NAMES
        self.curr_line = self._dict_reader.next()
        self.skip_line_chars = skip_line_chars

        # skip initial comment lines
        while self.curr_line[self.FIELD_NAMES[0]][0] in self.skip_line_chars :
            self.curr_line = self._dict_reader.next()

        if self.FIELD_NAMES[0] in self.curr_line.values() :
            self.curr_line = self._dict_reader.next()

    def __iter__(self) :
        return self

    def __getattr__(self,attr) :
        try:
            return self.__dict__[attr]
        except KeyError :
            return getattr(self._dict_reader,attr)

    def next(self) :
        """Emit the next record in the file as a dictionary with parsed values"""

        if self.curr_line is None :
            raise StopIteration()

        line = self.curr_line

        # check for comment
        while line[self.FIELD_NAMES[0]][0] in self.skip_line_chars :
            line = self.curr_line = self._dict_reader.next()

        for k,f in zip(self.FIELD_NAMES, self.FIELD_TYPES) :
            try :
                line[k] = f(line[k])
            except Exception, e :
                #sys.stderr.write('Warning: field %s on line %d could not be properly formatted, exception %s\n'%(k,self._dict_reader.reader.line_num,str(e)))
                line[k] = line[k]

        try :
            self.curr_line = self._dict_reader.next()
        except StopIteration :
            self.curr_line = None

        return line
    print "-    Destination folder: %s"%(destfolder)
    print "-    Parallel downloads: %s"%(threads)
    print "-----------------------------------------------------------------------"
    for i in range(2): print " "

    if not path.exists(destfolder):
        makedirs(destfolder)
    done = glob(path.join(destfolder,'*.jpg'))

    filename='photo_links'
    if path.exists(filename):
        op = open(filename, 'r')
        reader=DictReader(op)
        urls=[row['url_original'] for row in reader]
        op.seek(0)
        reader.next()
        names=[row['name'] for row in reader]
        op.close()
    
        print '-      Checking which pictures are not already downloaded.        -'
        print '-                                                                 -'
        print '-           It may take some time, or not, who knows.             -'
        tengui=[]
        for el,name in enumerate(names):
            if name in [d.split('/')[-1].split('.')[0] for d in done]:
                tengui.append(el)
            
        names = [i for j, i in enumerate(names) if j not in tengui]
        urls = [i for j, i in enumerate(urls) if j not in tengui]
    else:
        print 'Need picture links. Go find photo_links on my github  ;) https://github.com/dieguico/Project_Apollo_Archive'
Beispiel #48
0
class KnownGeneFile(SmartFileIter) :
    '''An iterable that parses UCSC's KnownGene gene annotation files.  Field 
    names are::

        FIELD_NAMES = [ 'name',
                        'chrom',
                        'strand',
                        'txStart',
                        'txEnd',
                        'cdsStart',
                        'cdsEnd',
                        'exonCount',
                        'exonStarts',
                        'exonEnds',
                        'proteinID',
                        'alignID',
                      ]
'''

    FIELD_NAMES = [ 'name',
                    'chrom',
                    'strand',
                    'txStart',
                    'txEnd',
                    'cdsStart',
                    'cdsEnd',
                    'exonCount',
                    'exonStarts',
                    'exonEnds',
                    'proteinID',
                    'alignID',
                  ]

    # function pointers for correct formatting of field names
    FIELD_TYPES = [ str,
                    str,
                    str,
                    int,
                    int,
                    int,
                    int,
                    lambda x: [int(y) for y in x.split(',') if len(y) > 0],
                    lambda x: [int(y) for y in x.split(',') if len(y) > 0],
                    lambda x: [int(y) for y in x.split(',') if len(y) > 0],
                    str,
                    str,
                  ]

    def __init__(self,kg_fn) :
        self.meta_data = []
        self.file_info = {}
        f = open(kg_fn)
        self._dict_reader = DictReader(f,delimiter='\t',fieldnames=KnownGeneFile.FIELD_NAMES)

    def __iter__(self) :
        return self

    def next(self) :
        line = self._dict_reader.next()
        for k,f in zip(self.FIELD_NAMES,self.FIELD_TYPES) :
            line[k] = f(line[k])
        return line
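
A short usage sketch for the class above (Python 2, with a hypothetical path to a downloaded UCSC knownGene.txt); numeric and comma-separated fields come back already parsed:

kg = KnownGeneFile('knownGene.txt')              # hypothetical local copy
for gene in kg:
    tx_len = gene['txEnd'] - gene['txStart']     # ints, thanks to FIELD_TYPES
    n_exons = len(gene['exonStarts'])            # comma lists become lists of ints
    print('{}\t{}\t{}\t{}'.format(gene['name'], gene['chrom'], tx_len, n_exons))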
Beispiel #49
0
def solr_retransform(fname, start_time, feed_file_size):
    """Create Solr-compatible versions of a datafile"""
    numopps = 0

    print_progress("Creating Solr transformed file for: " + fname)
    out_filename = fname + ".transformed"
    data_file = open(fname, "r")
    try:
        csv_reader = DictReader(data_file, dialect="our-dialect")
        csv_reader.next()
    except:
        print data_file.read()
        print_progress("error processing %s" % str(fname))
        return

    shortname = footprint_lib.guess_shortname(fname)
    if not shortname:
        shortname = fname

    fnames = csv_reader.fieldnames[:]
    fnames.append("c:eventrangestart:dateTime")
    fnames.append("c:eventrangeend:dateTime")
    fnames.append("c:eventduration:integer")
    fnames.append("c:aggregatefield:string")
    fnames.append("c:dateopportunityidgroup:string")
    fnames.append("c:randomsalt:float")
    fnamesdict = dict([(x, x) for x in fnames])

    data_file = open(fname, "r")
    # TODO: Switch to TSV - Faster and simpler
    csv_reader = DictReader(data_file, dialect="our-dialect")
    csv_writer = DictWriter(open(out_filename, "w"), dialect="excel-tab", fieldnames=fnames)
    for field_name in fnamesdict.keys():
        fnamesdict[field_name] = fnamesdict[field_name].lower()
        if fnamesdict[field_name].startswith("c:"):
            fnamesdict[field_name] = fnamesdict[field_name].split(":")[1]

    csv_writer.writerow(fnamesdict)
    now = parser.parse(commands.getoutput("date"))
    today = now.date()
    expired_by_end_date = num_bad_links = 0
    for rows in csv_reader:
        if rows["title"] and rows["title"].lower().find("anytown museum") >= 0:
            # bogus event
            continue

        if not "c:OpportunityID:string" in rows:
            continue

        # Split the date range into separate fields
        # event_date_range can be either start_date or start_date/end_date
        split_date_range = []
        if rows["event_date_range"]:
            split_date_range = rows["event_date_range"].split("/")

        if split_date_range:
            rows["c:eventrangestart:dateTime"] = split_date_range[0]
            if len(split_date_range) > 1:
                rows["c:eventrangeend:dateTime"] = split_date_range[1]
            else:
                if rows["c:openended:boolean"] == "Yes":
                    rows["c:eventrangeend:dateTime"] = rows["c:expires:dateTime"]
                else:
                    rows["c:eventrangeend:dateTime"] = rows["c:eventrangestart:dateTime"]

        # in case we somehow got here without already doing this
        rows["title"] = footprint_lib.cleanse_snippet(rows["title"])
        rows["description"] = footprint_lib.cleanse_snippet(rows["description"])
        rows["c:detailURL:URL"] = rows["c:detailURL:URL"].replace("&amp;", "&")
        if not rows["c:detailURL:URL"].lower().startswith("http"):
            rows["c:detailURL:URL"] = "http://" + rows["c:detailURL:URL"]

        link = str(rows["c:detailURL:URL"])
        if link in BAD_LINKS or check_links.is_bad_link(link, RECHECK_BAD_LINKS):
            num_bad_links += 1
            footprint_lib.feed_report(rows["c:OpportunityID:string"], "badlinks", shortname, link)
            dlink = "'" + str(link) + "'"
            if dlink not in BAD_LINKS:
                BAD_LINKS[dlink] = 0
                print_progress("bad link: " + dlink)
            BAD_LINKS[dlink] += 1
            continue

        rows["c:org_missionStatement:string"] = footprint_lib.cleanse_snippet(rows["c:org_missionStatement:string"])
        rows["c:org_description:string"] = footprint_lib.cleanse_snippet(rows["c:org_description:string"])

        rows["c:aggregatefield:string"] = footprint_lib.cleanse_snippet(
            " ".join(
                [
                    rows["title"],
                    rows["description"],
                    rows["c:provider_proper_name:string"],
                    rows.get("c:skills:string", rows.get("c:skill:string", "")),
                    rows.get("c:categoryTags:string", rows.get("c:categoryTag:string", "")),
                    rows["c:org_name:string"],
                    rows["c:eventName:string"],
                ]
            )
        )

        ids = rows.get("c:OpportunityID:string", rows.get("c:opportunityID:string", "OpportunityID"))
        ds = str(rows.get("c:eventrangestart:dateTime", "2001"))
        if ds.find("T") > 0:
            ds = ds.split("T")[0]
        rows["c:dateopportunityidgroup:string"] = "".join([ds, ids])

        for key in rows.keys():
            if key.find(":dateTime") != -1:
                if rows[key].find(":") > 0:
                    rows[key] += "Z"
            elif key.find(":integer") != -1:
                if rows[key] == "":
                    rows[key] = 0
                else:
                    # find the first numbers from the string, e.g. abc123.4 => 123
                    try:
                        rows[key] = int(re.sub(r"^.*?([0-9]+).*$", r"\1", rows[key]))
                    except:
                        print_progress("error parsing rows[key]=%s -- rejecting record." % str(rows[key]))
                        continue

        try:
            start_date = parser.parse(rows["c:eventrangestart:dateTime"], ignoretz=True)
        except:
            start_date = "2001-01-01T00:00:00"

        try:
            end_date = parser.parse(rows["c:eventrangeend:dateTime"], ignoretz=True)
        except:
            end_date = "2020-12-31T23:59:59"

        try:
            # check for expired opportunities
            delta_days = get_delta_days(relativedelta.relativedelta(end_date, today))
            if delta_days < -2 and delta_days > -3000:
                # more than 3000? it's the 1971 thing
                # else it expired at least two days ago
                footprint_lib.feed_report(rows["c:OpportunityID:string"], "expired", shortname, link)
                expired_by_end_date += 1
                continue

            duration_rdelta = relativedelta.relativedelta(end_date, start_date)
            duration_delta_days = get_delta_days(duration_rdelta)

            # Check whether start/end dates are the wrong way around.
            if duration_delta_days < 0:
                # removing this code for now-- too scary wrt. typos
                # e.g. what happens if 9/11/2009 - 9/7/2009  and it turns out
                # that the 7 was supposed to be a 17 i.e. simple typo-- by
                # swapping you've made it worse.  Correct solution is to add
                # to spreadsheet checker, then reject start>end here.
                # even this is the wrong place to do this-- should apply to
                # both Base and SOLR.
                # print_progress('Date error: start > end. Swapping dates...')
                # duration_delta_days = -duration_delta_days
                # temp = rows["c:eventrangestart:dateTime"]
                # rows["c:eventrangestart:dateTime"] = rows["c:eventrangeend:dateTime"]
                # rows["c:eventrangeend:dateTime"] = temp
                print_progress("start date after end date: rejecting record.")
                continue

            # Fix for events that are ongoing or whose dates were unsuccessfully
            # parsed. These events have start and end dates on 0000-01-01.
            #
            # These events get a large eventduration (used for ranking) so that
            # they are not erroneously boosted for having a short duration.
            current_rdelta = relativedelta.relativedelta(today, end_date)
            current_delta_days = get_delta_days(current_rdelta)
            rows["c:eventduration:integer"] = max(duration_delta_days, current_delta_days)
        except:
            pass

        # GBASE LEGACY: Fix to the +1000 to lat/long hack
        if not rows["c:latitude:float"] is None and float(rows["c:latitude:float"]) > 500:
            rows["c:latitude:float"] = float(rows["c:latitude:float"]) - 1000.0
        if not rows["c:longitude:float"] is None and float(rows["c:longitude:float"]) > 500:
            rows["c:longitude:float"] = float(rows["c:longitude:float"]) - 1000.0

        # The random salt is added to the result score during ranking to prevent
        # groups of near-identical results with identical scores from appearing
        # together in the same result pages without harming quality.
        rows["c:randomsalt:float"] = str(random.uniform(0.0, 1.0))

        csv_writer.writerow(rows)
        numopps += 1

    data_file.close()
    print_progress("bad links: %d" % num_bad_links)
    print_progress("  expired: %d" % expired_by_end_date)

    # NOTE: if you change this, you also need to update datahub/load_gbase.py
    # and frontend/views.py to avoid breaking the dashboard-- other status
    # messages don't matter.
    elapsed = datetime.now() - start_time
    xmlh.print_status(
        "done parsing: output "
        + str(footprint_lib.NUMORGS)
        + " organizations"
        + " and "
        + str(numopps)
        + " opportunities"
        + " ("
        + str(feed_file_size)
        + " bytes): "
        + str(int(elapsed.seconds / 60))
        + " minutes.",
        shortname,
    )

    proper_name = shortname
    if shortname in providers.ProviderNames:
        proper_name = providers.ProviderNames[shortname].get("name", shortname)

    # do the per-provider summary
    if shortname:
        processed = str(datetime.now()).split(".")[0]

        try:
            fh = open(FEEDSDIR + "/" + shortname + "-last.txt", "r")
        except:
            fh = None
            footprint_stats = None

        if fh:
            footprint_stats = fh.read()
            fh.close()

        fh = open(FEEDSDIR + "/" + shortname + "-history.txt", "a")
        if fh:
            fh.write("processed\t" + processed + "\n")
            fh.write("elapsed\t" + str(int(elapsed.seconds / 60)) + "\n")
            fh.write("bytes\t" + str(feed_file_size) + "\n")
            fh.write("numopps\t" + str(numopps) + "\n")
            fh.write("expired\t" + str(expired_by_end_date) + "\n")
            fh.write("badlinks\t" + str(num_bad_links) + "\n")
            if footprint_stats:
                fh.write(footprint_stats)
            fh.write("proper_name\t" + proper_name + "\n")
            fh.close()

    return out_filename
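
The event_date_range handling above (a bare start date, or start/end separated by a slash, with open-ended events falling back to their expiry date) is the part feeds most often get wrong. Below is a minimal standalone sketch of that rule; the helper name and sample values are hypothetical.

def split_event_range(event_date_range, open_ended, expires):
    parts = event_date_range.split("/") if event_date_range else []
    if not parts:
        return None, None
    start = parts[0]
    if len(parts) > 1:
        end = parts[1]
    elif open_ended == "Yes":
        end = expires
    else:
        end = start
    return start, end

print(split_event_range("2009-09-07/2009-09-11", "No", ""))
# ('2009-09-07', '2009-09-11')
print(split_event_range("2009-09-07", "Yes", "2009-12-31"))
# ('2009-09-07', '2009-12-31')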
Beispiel #50
0
def file_choice(tables, verbose):
    """
    Choose the right summary file component for the given Census table
    """

    # code originally in readcsv.py by Peter Gao
    datareader = DictReader(open(dirname(argv[0]) + "/sf1_data_field_descriptors_2010.csv"))
    data = []
    entry = None
    prevCol = None
    current_table = ""
    for line in datareader:
        new_table_number = line['TABLE NUMBER']
        if new_table_number != current_table:
            # save the old one
            if entry != None:
                data.append(entry)

            entry = {}
            current_table = new_table_number
            entry['Matrix Number'] = line['TABLE NUMBER']
            entry['File Name'] = line['SEGMENT']
            next_line = datareader.next()
            entry['Universe'] = (next_line['FIELD NAME'][9:].lstrip())

            entry['Name'] = line['FIELD NAME'][:line['FIELD NAME'].index('[')-1]
            entry['Cell Count'] = 0
            entry['Field Names'] = []

        # Increment the cell count iff there's actually data, rather than this being a descriptive row,
        # and save the column name
        if len(line['FIELD CODE']) > 0:
            entry['Cell Count'] += 1
            entry['Field Names'].append(line['FIELD CODE'])

            # sanity check: ensure the columns are stored in order
            if entry['Cell Count'] == 1:
                assert int(re.sub('[A-Z]', '', line['FIELD CODE'][-4:])) == 1,\
                    'Field names not stored in order for matrix %s: first column is %s' % (entry['Matrix Number'], line['FIELD CODE'])
            else:
                assert int(re.sub('[A-Z]', '', line['FIELD CODE'][-4:])) == int(re.sub('[A-Z]', '', prevCol[-4:])) + 1,\
                    'Field names are not stored in order for matrix %s: column %s follows column %s' %\
                    (entry['Matrix Number'], line['FIELD CODE'], prevCol)

            prevCol = line['FIELD CODE']

    files = []
    
    for table in tables:
        file_name, column_offset = None, 5
        
        for row in data:
            curr_file, curr_table, cell_count = row.get('File Name'), row.get('Matrix Number'), int(row.get('Cell Count'))
            
            if curr_file != file_name:
                file_name, column_offset = curr_file, 5
        
            if curr_table == table:
                if verbose:
                    print >> stderr, table, '-', row.get('Name'), 'in', row.get('Universe')
    
                files.append((table, file_name, column_offset, cell_count, row.get('Field Names')))
                break
            
            column_offset += cell_count
        
    return files
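
The offset bookkeeping in file_choice can be summarized on its own: each segment file restarts its data columns at offset 5 (presumably after the leading identification fields), and each table starts where the previous table's cells end. A small sketch with hypothetical table metadata:

def offsets(data):
    file_name, column_offset = None, 5
    result = {}
    for row in data:
        if row['File Name'] != file_name:
            file_name, column_offset = row['File Name'], 5
        result[row['Matrix Number']] = (file_name, column_offset)
        column_offset += row['Cell Count']
    return result

data = [
    {'File Name': '03', 'Matrix Number': 'P3', 'Cell Count': 8},
    {'File Name': '03', 'Matrix Number': 'P4', 'Cell Count': 3},
    {'File Name': '04', 'Matrix Number': 'P5', 'Cell Count': 17},
]
print(offsets(data))
# e.g. {'P3': ('03', 5), 'P4': ('03', 13), 'P5': ('04', 5)}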
Beispiel #51
0
def upload_waterpoints(filename, skip=0, limit=None):
    """Upload waterpoints from a CSV file."""
    # Use sys.stdout.write so waterpoints can be printed nicely and succinctly
    import sys

    date_converter = lambda s: datetime.strptime(s, '%Y-%m-%d')
    bool_converter = lambda s: s == "true"

    status_map = {
        "non functional": "not functional",
        "functional needs repair": "needs repair"
    }

    status_converter = lambda s: status_map.get(s.lower(), s.lower())

    convert = {
        'gid': int,
        'object_id': int,
        'valid_from': date_converter,
        'valid_to': date_converter,
        'amount_tsh': float,
        'breakdown_year': int,
        'date_recorded': date_converter,
        'gps_height': float,
        'latitude': float,
        'longitude': float,
        'num_private': int,
        'region_code': int,
        'district_code': int,
        'population': int,
        'public_meeting': bool_converter,
        'construction_year': int,
        'status_group': status_converter
    }

    def print_flush(msg):
        sys.stdout.write(msg)
        sys.stdout.flush()

    facility_code = "wpf001"
    print_every = 1000
    print_flush("Adding waterpoints. Please be patient.")

    with open(filename) as f:
        reader = DictReader(f)
        for i in range(skip):
            reader.next()
        for i, d in enumerate(reader):
            actual_index = i + skip + 2
            do_print = actual_index % print_every == 0
            try:
                d = dict((k, convert.get(k, str)(v)) for k, v in d.items() if v)
                coords = [d.pop('longitude'), d.pop('latitude')]
                d['location'] = {'type': 'Point', 'coordinates': coords}
                d['facility_code'] = facility_code
                if not check(add_document('waterpoints', d), 201, False):
                    raise Exception()
                if do_print:
                    print_flush(".")

            except Exception as e:
                print "Error adding waterpoint", e
                pprint(d)
                exit()

            if limit and i >= limit:
                break
    # Create a 2dsphere index on the location field for geospatial queries
    app.data.driver.db['resources'].create_index([('location', '2dsphere')])
    print "Waterpoints uploaded!"
Beispiel #52
0
class CourtDataProcessor:
    def __init__(self, court_type, dob_start, dob_end):
        self.court_type = court_type
        self.dob_start = dob_start
        self.in_filepath = '{}_{}_{}.csv'.format(dob_start, dob_end,
                                                 court_type)

        self.download_data(dob_start, dob_end)

        self.in_file = open(self.in_filepath)
        self.data_reader = DictReader(self.in_file)

        self.last_person = None

    def download_data(self, dob_start, dob_end):
        # PGHOST, PGDATABASE, PGUSER, PGPASSWORD
        if self.court_type == 'district':
            gender_field = 'Gender'
            name_field = 'Name'
            table = 'DistrictCriminalCase'
        else:
            gender_field = 'Sex'
            name_field = 'Defendant'
            table = 'CircuitCriminalCase'

        copy_cmd = '\\copy (SELECT id, "{}", "{}", "DOB", "Address" FROM "{}"'.format(
            gender_field, name_field, table)
        copy_cmd += ' WHERE "DOB" >= \'{}\' AND "DOB" <= \'{}\''.format(
            dob_start, dob_end)
        copy_cmd += ' ORDER BY "{}", "DOB", "{}") To \'{}\' With CSV HEADER;'.format(
            gender_field, name_field, self.in_filepath)

        psql_cmd = ['psql', '-c', copy_cmd]
        print self.in_filepath, subprocess.check_output(psql_cmd)

    def close(self):
        self.in_file.close()
        os.remove(self.in_filepath)

    def next_people(self, gender_group, dob_group, letter_group):
        people = []
        while True:
            if self.last_person is not None:
                person = self.last_person
                self.last_person = None
            else:
                try:
                    person = self.data_reader.next()
                except StopIteration:
                    break

            gender = person['Gender'] if 'Gender' in person else person['Sex']
            name = person['Name'] if 'Name' in person else person['Defendant']
            dob = person['DOB']

            if gender not in GENDERS:
                continue
            if name[0] not in LETTERS:
                continue

            if gender == gender_group and dob == dob_group and name.startswith(
                    letter_group):
                people.append({
                    'id': person['id'],
                    'name': name,
                    'address': person['Address'],
                    'courtType': self.court_type
                })
            else:
                self.last_person = person
                break
        return people
Beispiel #53
0
    def next(self):  # For Python 2
        row = DictReader.next(self)
        for (att, func) in self._casts.items():
            row[att] = func(row[att])
        return row
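
For context, a hedged sketch of the kind of class a next() override like the one above typically lives in: a csv.DictReader subclass that applies per-column cast functions. The class name, constructor and example columns below are hypothetical.

from csv import DictReader

class TypedDictReader(DictReader):
    def __init__(self, f, casts, **kwargs):
        DictReader.__init__(self, f, **kwargs)
        self._casts = casts

    def next(self):  # For Python 2
        row = DictReader.next(self)
        for (att, func) in self._casts.items():
            row[att] = func(row[att])
        return row

# Usage sketch (hypothetical file and columns):
#   reader = TypedDictReader(open('data.csv'), {'count': int, 'score': float})
#   for row in reader:
#       ...  # row['count'] is an int, row['score'] is a float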
Beispiel #54
0
def buildRootFile(fname, outputdir):
  """Build a ROOT file (one TTree) from a Samba NTP text file."""

  if outputdir.rstrip('/') == '.':
    outFname = os.path.basename(fname) + '.root'
  else:
    outFname = outputdir.rstrip('/') + '/' + os.path.basename(fname) +'.root'
    
  print 'building root file from', fname, '>> output:', outFname
  
  #loop over the file for upload; DictReader opens the file itself,
  #so no separate file handle is needed here
  reader = DictReader(open(fname), delimiter=' ', skipinitialspace = True)
  
  try:
      doc = parseDoc(reader.next()) #read the first line
  except Exception as e:
      print e
      return None

  #here, i search through the key/value pairs of doc trying to determine
  #the type of value and then creating a list of variable names, an array for 
  #each variable stored in a list (necessary for TTree branching), and then
  #a TTree branch format descriptor 
  varNames = list()
  arrayList = list()
  descriptor = list()
  for k, v in doc.items():
    #print k, v
    name = formatname(k)
    
    if isinstance(v,str)==False:
      if isinstance(v, int): #int
          varNames.append(name)
          if name == 'Stamp' or name == 'Position' or name == 'GigaStamp' or name == 'Evt' or re.match('List',name) or re.match('Date',name):
            arrayList.append(np.arange(1, dtype=np.uint32))
            descriptor.append(str(name) + '/i')
          else:
            arrayList.append(array.array('i',[0]))  #we have to use arrays because of the way that Python deals with memory and the way that TTrees deal with memory
            descriptor.append(str(name) + '/I')
      else: #must be a float
          try:
            if math.isnan(float(v))==False:
              varNames.append(name) 
              arrayList.append(array.array('f',[0]))
              descriptor.append(str(name) + '/F')
          except:
            pass
        
    else:  #must be a string
      # we're skipping strings.
      #varNames.append(name)
      #arrayList.append(array.array('i',[0]))
      #descriptor.append(str(name) + '/C')
      pass

  file = TFile.Open(outFname, 'recreate')
  tree = TTree('ntp_tree','A Tree based on the Samba NTP files.')
  
  #print varNames
  #print arrayList
  #print descriptor
  
  for i in range(len(arrayList)): #set up the branches
    tree.Branch(varNames[i],arrayList[i],descriptor[i]) 
  
  #re-read the file so that we start at the first line
  reader = DictReader(open(fname), delimiter=' ', skipinitialspace = True)
  try:
    for line in reader:
      #print 'next line'
      line = parseDoc(line)
      for k, v in line.items():  
        name = formatname(k)    
        try:                      
          i = varNames.index(name)  #it's not guaranteed that the order of key/value pairs is
                                  #maintained, so we have to use the list.index function
                                  #to find the proper index for this particular key
          try:
            arrayList[i][0] = v       #set the value to the proper array  (arrayList[i] returns an array and arrayList[i][0] is the zero'th element of the array)
          except OverflowError:
            print i
            print k, v
            raise OverflowError

            #print k, v
          #print i, arrayList[i][0]
        except ValueError:
          pass  #varNames.index(name) raises ValueError when 'name' isn't in the list. In the code above,
              #strings are ignored, so when we come across a key that isn't in our list,
              #which is probably a string, we ignore it here.
      #print 'fill'
      tree.Fill()
  except Exception as e:
    print e
      
  file.cd()
  tree.Write()
  file.Close()
  
  return outFname
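
The branch bookkeeping above (one single-element array per variable plus a type descriptor string, filled in place before each tree.Fill()) is the usual PyROOT pattern. A minimal hedged sketch with hypothetical branch names:

import array
import numpy as np
from ROOT import TFile, TTree

out = TFile.Open('example.root', 'recreate')
tree = TTree('ntp_tree', 'Example tree')

energy = array.array('f', [0.0])         # float branch buffer
stamp = np.zeros(1, dtype=np.uint32)     # unsigned int branch buffer

tree.Branch('Energy', energy, 'Energy/F')
tree.Branch('Stamp', stamp, 'Stamp/i')

for e, s in [(1.5, 10), (2.25, 11)]:     # hypothetical events
    energy[0] = e
    stamp[0] = s
    tree.Fill()                          # the tree reads the buffers in place

out.cd()
tree.Write()
out.Close()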