Example 1
 def load(self):
     self._common_kwargs = self._build_common_election_kwargs()
     self._common_kwargs['reporting_level'] = 'precinct'
     # Store result instances for bulk loading
     results = []
     fieldnames = [
         'county_code', 'county_name', 'election_number', 'election_date',
         'election_name', 'precinct_id', 'polling_location',
         'registered_voters', 'registered_republicans',
         'registered_democrats', 'registered_others', 'contest_name',
         'district', 'contest_code', 'candidate', 'party', 'candidate_id',
         'doe_candidate_number', 'votes'
     ]
     with self._file_handle as tsvfile:
         tsv = [x.replace('\0', '') for x in tsvfile]  # remove NULL bytes
         reader = unicodecsv.DictReader(tsv,
                                        fieldnames=fieldnames,
                                        delimiter='\t')
         for row in reader:
             if self._skip_row(row):
                 continue
             results.append(self._prep_precinct_result(row))
     RawResult.objects.insert(results)
Example 2
def getAuthDict(auth_doc_path):
    """ makes a dictionary for the accepted values present in an autority
    document """

    auth_dict = {}
    with open(auth_doc_path, 'rU') as f:
        fields = [
            'conceptid', 'Preflabel', 'altlabels', 'ParentConceptid',
            'ConceptType', 'Provider'
        ]
        rows = unicodecsv.DictReader(f,
                                     fieldnames=fields,
                                     encoding='utf-8-sig',
                                     delimiter=',',
                                     restkey='ADDITIONAL',
                                     restval='MISSING')
        rows.next()  # skip the header row, since fieldnames were supplied explicitly
        rownum = 2
        for row in rows:
            auth_dict[row['conceptid']] = row['Preflabel']
            rownum += 1

    return auth_dict
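
For reference, the restkey and restval arguments used above control what DictReader does with surplus or missing columns: extra values are collected in a list under the restkey key, and absent columns fall back to restval. A minimal standalone sketch (Python 3 stdlib csv, in-memory data; all names are illustrative):

import csv
import io

# One row with an extra column, one row with a missing column.
sample = io.StringIO("a,b,c,extra\n1,2\n")
reader = csv.DictReader(sample,
                        fieldnames=['col1', 'col2', 'col3'],
                        restkey='ADDITIONAL',
                        restval='MISSING')
rows = list(reader)
assert rows[0]['ADDITIONAL'] == ['extra']  # surplus value captured under restkey
assert rows[1]['col3'] == 'MISSING'        # missing value filled with restval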
Example 3
    def verify_rows_in_csv(self, expected_rows, verify_order=True):
        """
        Verify that the last ReportStore CSV contains the expected content.

        Arguments:
            expected_rows (iterable): An iterable of dictionaries,
                where each dict represents a row of data in the last
                ReportStore CSV.  Each dict maps keys from the CSV
                header to values in that row's corresponding cell.
            verify_order (boolean): When True (the default), we verify
                that both the content and order of `expected_rows` match
                the actual CSV rows.  When False, we only verify that
                the content matches.
        """
        report_store = ReportStore.from_config()
        report_csv_filename = report_store.links_for(self.course.id)[0][0]
        with open(report_store.path_to(self.course.id, report_csv_filename)) as csv_file:
            # Expand the DictReader generator so we don't lose its content
            csv_rows = [row for row in unicodecsv.DictReader(csv_file)]
            if verify_order:
                self.assertEqual(csv_rows, expected_rows)
            else:
                self.assertItemsEqual(csv_rows, expected_rows)
Example 4
def load_patron_data_file(file_path, non_distance_zip_codes):
    patron_data = {}

    csv_file = open(file_path, 'rb')
    csv_reader = unicodecsv.DictReader(csv_file, delimiter=',', encoding='ISO-8859-1') 
    for row in csv_reader:
        distance = False
    
        if row['zip_1'] and row['zip_1'][:5] not in non_distance_zip_codes:
            distance = True
        try:
            if row['street_line1'] == '':
                logging.warn("Mandatory field street_line1 is not present in record %s" % row['id_number'])
            elif row['email'] == '':
                logging.warn("Mandatory field email is not present in record %s" % row['id_number'])
            else:
                patron_data[row['id_number']] = Patron(row, distance)
        except ValueError as error:
            logging.warn(error.args)

    csv_file.close()

    return patron_data
Example 5
File: csvw.py Project: raadjoe/COW
    def _simple(self):
        """Starts a single process for converting the file"""
        with open(self.target_file, 'w') as target_file:
            with open(self.file_name, 'rb') as csvfile:
                logger.info("Opening CSV file for reading")
                reader = csv.DictReader(csvfile,
                                        encoding=self.encoding,
                                        delimiter=self.delimiter,
                                        quotechar=self.quotechar)

                logger.info("Starting in a single process")
                c = BurstConverter(self.np.ag.identifier, self.columns,
                                   self.schema, self.metadata_graph,
                                   self.encoding, self.output_format)
                # Out will contain an N-Quads serialized representation of the
                # converted CSV
                out = c.process(0, reader, 1)
                # We then write it to the file
                target_file.write(out)

            self.convert_info()
            # Finally, write the nanopublication info to file
            target_file.write(self.np.serialize(format=self.output_format))
Example 6
    def post(self, request, *args, **kwargs):
        file = request.data.get(u'file', None)
        delimiter = str(request.data.get('delimiter', None))
        encoding = request.data.get(u'encoding', None)

        if request.data.get(u'mapping'):
            mapping = json.loads(request.data.get(u'mapping'))
        else:
            mapping = dict()

        headers = csv.reader(file, encoding=encoding,
                             delimiter=delimiter).next()

        reader = csv.DictReader(file,
                                fieldnames=headers,
                                encoding=encoding,
                                delimiter=delimiter)

        result = []
        for v in self.generate(reader, mapping):
            result.append(v)

        return response.Response(data=result, status=status.HTTP_200_OK)
Example 7
def test_source_sheet_disambiguator():
    ids = [
        1697, 2636, 8689, 11419, 13255, 16085, 18838, 26981, 27226, 31603,
        31844, 35830, 49364, 50853, 57106, 65498, 78110, 85003, 90289, 92571,
        101667, 105718
    ]
    rows = []
    start_row = 0
    with open("test.csv", "rb") as fin:
        cin = unicodecsv.DictReader(fin)
        for i, id in enumerate(ids):
            if i % 1 == 0:
                print("{}/{}".format(i, len(ids)))
            sheet = db.sheets.find_one({"id": id})
            if not sheet:
                print("continue")
                continue
            new_rows = mutate_sheet(sheet, refine_ref_by_text)
            rows += new_rows
            for irow in range(start_row, start_row + len(new_rows)):
                csv_row = next(cin)
                assert csv_row == rows[irow]
            start_row += len(new_rows)
Example 8
def convert_csv_to_json(filename_csv='datasets.csv',
                        filename_json='datasets.json',
                        encoding='latin-1'):
    num_rows = 0
    with open(filename_csv, 'r') as fp_read, open(filename_json,
                                                  'w') as fp_write:
        fp_write.write(u'{\n\t"datasets": [\n')
        reader = unicodecsv.DictReader(fp_read, encoding=encoding)
        for num_rows, row in enumerate(reader, start=1):
            fp_write.write(u"\t\t%s,\n" % json.dumps(row))
        # We need to back up a couple of spaces to delete the last comma.
        # from the seek() docs:
        #   To change the file object's position, use f.seek(offset, from_what).
        #   The position is computed from adding offset to a reference point;
        #   the reference point is selected by the from_what argument. A
        #   from_what value of 0 measures from the beginning of the file, 1 uses
        #   the current file position, and 2 uses the end of the file as the
        #   reference point. from_what can be omitted and defaults to 0, using
        #   the beginning of the file as the reference point.
        # via https://docs.python.org/2/tutorial/inputoutput.html
        fp_write.seek(-2, 1)
        fp_write.write(u'\n]}')
    return num_rows
Example 9
 def load_non2002_file(self, mapping):
     with self._file_handle as csvfile:
         results = []
         target_offices = set([
             'President - Vice Pres', 'U.S. Senator', 'U.S. Congress',
             'Governor / Lt. Governor', 'Comptroller', 'Attorney General',
             'State Senator', 'House of Delegates'
         ])
         reader = unicodecsv.DictReader(csvfile, encoding='latin-1')
         for row in reader:
             # Skip non-target offices
             if row['Office Name'].strip() not in target_offices:
                 continue
             elif 'state_legislative' in self.source:
                 results.extend(
                     self._prep_non2002_state_leg_results(row, mapping))
             elif 'precinct' in self.source:
                 results.append(
                     self._prep_non2002_precinct_result(row, mapping))
             else:
                 results.append(
                     self._prep_non2002_county_result(row, mapping))
         Result.objects.insert(results)
Example 10
def parse_csv(file_stream, expected_columns=None):
    """
    Parse csv file and return a stream of dictionaries representing each row.

    First line of CSV file must contain column headers.

    Arguments:
         file_stream: input file
         expected_columns (set[unicode]): columns that are expected to be present

    Yields:
        dict: CSV line parsed into a dictionary.
    """
    reader = unicodecsv.DictReader(file_stream, encoding="utf-8")

    if expected_columns and set(expected_columns) - set(reader.fieldnames):
        raise ValidationError(ValidationMessages.MISSING_EXPECTED_COLUMNS.format(
            expected_columns=", ".join(expected_columns), actual_columns=", ".join(reader.fieldnames)
        ))

    # "yield from reader" would be nicer, but we're on python2.7 yet.
    for row in reader:
        yield row
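
A hedged usage sketch for the generator above (the file name, the process callback, and the logger are placeholders; ValidationError comes from the same module):

with open('learners.csv', 'rb') as stream:  # unicodecsv expects a binary file object
    try:
        # The column check only runs on first iteration, since parse_csv is a generator.
        for row in parse_csv(stream, expected_columns={u'email', u'username'}):
            process(row)
    except ValidationError as error:
        log.error(u"Invalid CSV: %s", error)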
Example 11
    def to_internal_value(self, data):
        super(JsonFileField, self).to_internal_value(data)

        if is_zipfile(data):
            with ZipFile(data) as zf:
                raw_data = zf.read(splitext(data.name)[0])
        else:
            data.seek(0)
            raw_data = data.read()

        try:
            data.json = json.loads(raw_data)
        except ValueError:
            try:
                data.json = json.loads(raw_data, encoding='cp1251')
            except ValueError:
                try:
                    lines = raw_data.splitlines()
                    dialect = csv.Sniffer().sniff(lines[0], [',', ';', '\t'])
                    data.json = [item for item in csv.DictReader(lines, dialect=dialect)]
                except (ValueError, csv.Error):
                    self.fail('json')
        return data
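
For context, the fallback branch above relies on csv.Sniffer().sniff() to guess the dialect (delimiter, quoting) from a sample line before handing the data to DictReader. A minimal standalone sketch of that behaviour (stdlib csv, made-up data):

import csv

sample = "name;age;city"
dialect = csv.Sniffer().sniff(sample, delimiters=";,\t")
assert dialect.delimiter == ';'

rows = list(csv.DictReader(["name;age;city", "Ada;36;London"], dialect=dialect))
assert rows[0]['age'] == '36'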
Example 12
    def load(self):
        headers = [
            'county', 'date', 'precinct_abbrev', 'precinct', 'contest',
            'choice', 'party', 'total_votes', 'timestamp'
        ]
        self._common_kwargs = self._build_common_election_kwargs()
        self._common_kwargs['reporting_level'] = 'precinct'
        # Store result instances for bulk loading
        results = []

        with self._file_handle as csvfile:
            reader = unicodecsv.DictReader(csvfile,
                                           delimiter='\t',
                                           fieldnames=headers,
                                           encoding='latin-1')
            for row in reader:
                if self._skip_row(row):
                    continue
                if row['precinct'] == 'absentee/provisional':
                    results.append(self._prep_county_result(row))
                else:
                    results.append(self._prep_precinct_result(row))
        RawResult.objects.insert(results)
Example 13
    def load(self):
        headers = [
            'candidate', 'office', 'district', 'county', 'votes', 'winner'
        ]
        self._common_kwargs = self._build_common_election_kwargs()
        self._common_kwargs['reporting_level'] = 'county'
        # Store result instances for bulk loading
        results = []

        with self._file_handle as csvfile:
            reader = unicodecsv.DictReader(csvfile, fieldnames=headers)
            for row in reader:
                if self._skip_row(row):
                    continue
                if row['county'].strip() == '':
                    total_votes = int(row['votes'].strip())
                    contest_winner = row['winner'].strip()
                else:
                    rr_kwargs = self._common_kwargs.copy()
                    rr_kwargs.update(self._build_contest_kwargs(row))
                    rr_kwargs.update(self._build_candidate_kwargs(row))
                    jurisdiction = row['county'].strip()
                    rr_kwargs.update({
                        'jurisdiction': jurisdiction,
                        'ocd_id': "{}/county:{}".format(
                            self.mapping['ocd_id'], ocd_type_id(jurisdiction)),
                        'office': row['office'].strip(),
                        'district': row['district'].strip(),
                        'votes': int(row['votes'].strip()),
                    })
                    results.append(RawResult(**rr_kwargs))
        RawResult.objects.insert(results)
Example 14
    def load(self):
        headers = [
            'county', 'precinct', 'office', 'district', 'party', 'candidate',
            'votes', 'pct'
        ]
        self._common_kwargs = self._build_common_election_kwargs()
        self._common_kwargs['reporting_level'] = 'precinct'
        # Store result instances for bulk loading
        results = []

        with self._file_handle as csvfile:
            reader = unicodecsv.DictReader(csvfile, fieldnames=headers)
            for row in reader:
                if self._skip_row(row):
                    continue
                rr_kwargs = self._common_kwargs.copy()
                rr_kwargs['primary_party'] = row['party'].strip()
                rr_kwargs.update(self._build_contest_kwargs(row))
                rr_kwargs.update(self._build_candidate_kwargs(row))
                jurisdiction = row['precinct'].strip()
                county_ocd_id = [
                    c for c in self.datasource._jurisdictions()
                    if c['county'].upper() == row['county'].upper()
                ][0]['ocd_id']
                rr_kwargs.update({
                    'jurisdiction': jurisdiction,
                    'parent_jurisdiction': row['county'],
                    'ocd_id': "{}/precinct:{}".format(
                        county_ocd_id, ocd_type_id(jurisdiction)),
                    'votes': self._votes(row['votes']),
                })
                results.append(RawResult(**rr_kwargs))
        RawResult.objects.insert(results)
Example 15
 def nibelis2pivot(self, fileobj):
     fieldnames = [
         'trasha', 'trashb', 'journal', 'trashd', 'trashe',
         'trashf', 'trashg', 'date', 'trashi', 'trashj', 'trashk',
         'trashl', 'trashm', 'trashn', 'account', 'trashp',
         'trashq', 'amount', 'trashs', 'sign', 'trashu',
         'trashv', 'name',
         'trashx', 'trashy', 'trashz', 'trashaa', 'trashab',
         'trashac', 'trashad', 'trashae', 'analytic']
     reader = unicodecsv.DictReader(
         fileobj,
         fieldnames=fieldnames,
         delimiter=';',
         quoting=unicodecsv.QUOTE_MINIMAL,
         encoding='latin1')
     res = []
     i = 0
     for l in reader:
         i += 1
         if i == 1:
             continue
         amount = float(l['amount'].replace(',', '.'))
         credit = l['sign'] == 'C' and amount or False
         debit = l['sign'] == 'D' and amount or False
         vals = {
             'journal': l['journal'],
             'account': l['account'],
             'credit': credit,
             'debit': debit,
             'date': datetime.strptime(l['date'], '%y%m%d'),
             'name': l['name'],
             'line': i,
         }
         if l.get('analytic'):
             vals['analytic'] = l['analytic']
         res.append(vals)
     return res
Example 16
def load_geonames_cities(countries):
    """
    Return list of cities of the world. Via geonames dump.
    http://download.geonames.org/export/dump/

    Only cities in list of countries `countries` are retained.
    Returns dict of mappings from a country code in `countries` to list
    of cities. Assumes countries are lower-case alpha2 codes.
    """
    fpath = 'dat/geonames_cities/cities1000.txt'
    fields = 'geonameid,name,asciiname,alternatenames,latitude,longitude,feature_class,feature_code,country_code,cc2,admin1_code,admin2_code,admin3_code,admin4_code,population,elevation,dem,timezone,modification_date'.split(
        ',')

    out = {code: [] for code in countries}

    with open(fpath) as fp:
        rdr = csv.DictReader(fp,
                             fieldnames=fields,
                             delimiter='\t',
                             quoting=csv.QUOTE_NONE)
        for row in rdr:
            cc = row['country_code'].lower()
            if cc in countries:
                lat = float(row['latitude'])
                lon = float(row['longitude'])
                name = row['name']
                ad1 = row['admin1_code']
                pop = int(row['population'])

                out[cc].append({
                    'city': name,
                    'state': ad1,
                    'population': pop,
                    'longitude': lon,
                    'latitude': lat
                })
    return out
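
As an aside, quoting=csv.QUOTE_NONE matters here because the geonames dump is plain tab-separated text in which any double quote is part of the data. A small illustration of the difference (stdlib csv, made-up line):

import csv
import io

line = '42\t"Hole in the Wall" Ridge\t12.5\n'

default = next(csv.reader(io.StringIO(line), delimiter='\t'))
raw = next(csv.reader(io.StringIO(line), delimiter='\t', quoting=csv.QUOTE_NONE))

assert default[1] == 'Hole in the Wall Ridge'  # quote characters were interpreted
assert raw[1] == '"Hole in the Wall" Ridge'    # QUOTE_NONE keeps the field verbatim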
Example 17
 def handle_api_teachers(self, http_context):
     school = 'default-school'
     path = '/etc/linuxmuster/sophomorix/' + school + '/teachers.csv'
     if not os.path.isfile(path):
         os.mknod(path)
     fieldnames = [
         'class',
         'last_name',
         'first_name',
         'birthday',
         'login',
         'password',
         'usertoken',
         'quota',
         'mailquota',
         'reserved',
     ]
     if http_context.method == 'GET':
         with authorize('lm:users:teachers:read'):
             return list(
                 csv.DictReader(CSVSpaceStripper(
                     open(path),
                     encoding=http_context.query.get('encoding', 'utf-8')),
                                delimiter=';',
                                fieldnames=fieldnames))
     if http_context.method == 'POST':
         with authorize('lm:users:teachers:write'):
             data = http_context.json_body()
             for item in data:
                 item.pop('_isNew', None)
             lmn_backup_file(path)
             with open(path, 'w') as f:
                 csv.DictWriter(f,
                                delimiter=';',
                                fieldnames=fieldnames,
                                encoding=http_context.query.get(
                                    'encoding', 'utf-8')).writerows(data)
Example 18
    def test_data_creation_from_base_row(
            self, mock_read_csv):
        """
        Confirm that loading a single row of real base data creates
        a CountyMortgageData object with the base row's values,
        and that the object's calculated API values are correct.
        """

        f = BytesIO(self.data_header + self.data_row)
        reader = unicodecsv.DictReader(f)
        mock_read_csv.return_value = reader
        load_values()
        self.assertEqual(CountyMortgageData.objects.count(), 1)
        county = CountyMortgageData.objects.first()
        fields = reader.fieldnames
        fields.pop(fields.index('fips'))  # test string separately
        fields.pop(fields.index('open'))  # 'open' is stored as 'total'
        fields.pop(fields.index('date'))  # date must be parsed before testing
        self.assertEqual(county.fips, self.data_row_dict.get('fips'))
        open_value = int(self.data_row_dict.get('open'))
        self.assertEqual(county.total, open_value)
        target_date = parser.parse(self.data_row_dict['date']).date()
        self.assertEqual(county.date, target_date)
        for field in fields:  # remaining fields can be tested in a loop
            self.assertEqual(
                getattr(county, field), int(self.data_row_dict.get(field)))
        # test computed values
        self.assertEqual(
            county.epoch,
            int(target_date.strftime('%s')) * 1000)
        self.assertEqual(
            county.percent_90,
            int(self.data_row_dict.get('ninety')) * 1.0 / open_value)
        self.assertEqual(
            county.percent_30_60,
            (int(self.data_row_dict.get('thirty')) +
             int(self.data_row_dict.get('sixty'))) * 1.0 / open_value)
Example 19
 def genericcsv2pivot(self, fileobj):
     fieldnames = [
         'date',
         'journal',
         'account',
         'analytic',
         'name',
         'debit',
         'credit',
     ]
     reader = unicodecsv.DictReader(fileobj,
                                    fieldnames=fieldnames,
                                    delimiter=',',
                                    quotechar='"',
                                    quoting=unicodecsv.QUOTE_MINIMAL,
                                    encoding='utf-8')
     res = []
     i = 0
     for l in reader:
         i += 1
         vals = {
             'journal': {
                 'code': l['journal']
             },
             'account': {
                 'code': l['account']
             },
             'credit': float(l['credit'] or 0),
             'debit': float(l['debit'] or 0),
             'date': datetime.strptime(l['date'], '%d/%m/%Y'),
             'name': l['name'],
             'line': i,
         }
         if l['analytic']:
             vals['analytic'] = {'code': l['analytic']}
         res.append(vals)
     return res
Example 20
def load_fips_meta(counties=True):
    """`
    Load FIPS mappings, starting with base CSV files.

    County CSV headings are:
        1: state
        2: state_fips
        3: county_fips
        4: complete_fips
        5: county_name

    MSA CSV headings are:
        1: msa_id
        2: msa_name
        3: county_fips
        4: county_name
    """
    for filename in ['state_county_fips.csv', 'msa_county_crosswalk.csv']:
        with open("{}/{}".format(FIPS_DATA_PATH, filename), 'rb') as f:
            reader = unicodecsv.DictReader(f)
            fips_data = [row for row in reader]
            if 'state' in filename:
                FIPS.county_fips = {
                    row['complete_fips']: {
                        'county': row['county_name'],
                        'fips': row['complete_fips'],
                        'state': row['state'],
                        'name': row['county_name'],
                    }
                    for row in fips_data if row['state'] not in NON_STATES
                }
            else:
                FIPS.msa_fips = assemble_msa_mapping(fips_data)
    load_fips_lists()
    if counties is True:
        load_county_mappings()
    load_constants()
Example 21
    def jurisdiction_mappings(self, filename=None):
        """
        Retrieve jurisdictional mappings based on OCD IDs.

        Args:
            filename: Filename of the CSV file containing jurisdictional
            mappings.  Default is
            openelex/us/{state_abbrev}/mappings/{state_abbrev}.csv.

        Returns:
            A list of dictionaries containing jurisdiction Open Civic Data
            identifiers, jurisdiction names and other metadata about
            the jurisdiction.  The returned dictionaries include a
            value for each column in the input CSV.

            Example jurisdiction mapping dictionary:

            ```
            {
                'ocd_id': 'ocd-division/country:us/state:ar/county:washington',
                'fips': '05143',
                'name': 'Washington'
            }
            ```

        """
        try:
            return self._cached_jurisdiction_mappings
        except AttributeError:
            if filename is None:
                filename = join(self.mappings_dir, self.state + '.csv')

            with open(filename, 'rU') as csvfile:
                reader = unicodecsv.DictReader(csvfile)
                self._cached_jurisdiction_mappings = [row for row in reader]

            return self._cached_jurisdiction_mappings
Example 22
def loadCSV(csvFileName, rowKeyName=None):
    """

    Args:
        csvFileName:
        rowKeyName:

    Returns:

    """

    print "Loading " + csvFileName
    csv_fp = open(csvFileName, "rbU")
    dictRecords = {}
    fields = {}

    csv_reader = None
    try:
        with csv_fp:
            csv_reader = unicodecsv.DictReader(csv_fp,
                                               delimiter=",",
                                               quoting=unicodecsv.QUOTE_ALL,
                                               errors='strict')
            fields = csv_reader.fieldnames
            for row in csv_reader:
                if rowKeyName is None:
                    rowKeyName = fields[0]

                dictRecords[row[rowKeyName]] = row
    except Exception as err:
        print err

    print "Loaded " + str(len(dictRecords)) + " rows from " + csvFileName

    return {'fieldnames': fields, 'dict': dictRecords}
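
A hedged usage sketch (the file name and the "id" column are illustrative):

loaded = loadCSV("members.csv", rowKeyName="id")
print "Columns: " + ", ".join(loaded['fieldnames'])
record = loaded['dict'].get("42")  # the row whose id column equals "42"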
Example 23
 def handle_api_extra_courses(self, http_context):
     path = '/etc/sophomorix/user/extrakurse.txt'
     fieldnames = [
         'course',
         'base_name',
         'count',
         'birthday',
         'gecos',
         'password',
         'removal_date',
     ]
     if http_context.method == 'GET':
         with authorize('lm:users:extra-courses:read'):
             return list(
                 csv.DictReader(
                     CSVSpaceStripper(
                         open(path),
                         encoding=http_context.query.get('encoding', 'utf-8')
                     ),
                     delimiter=';',
                     fieldnames=fieldnames
                 )
             )
     if http_context.method == 'POST':
         with authorize('lm:users:extra-courses:write'):
             data = http_context.json_body()
             for item in data:
                 item.pop('_isNew', None)
             lm_backup_file(path)
             with open(path, 'w') as f:
                 csv.DictWriter(
                     f,
                     delimiter=';',
                     fieldnames=fieldnames,
                     encoding=http_context.query.get('encoding', 'utf-8')
                 ).writerows(data)
Example 24
    def _run_task(self):
        for obj in self.options['objects']:
            self.logger.info('Deleting all {} records'.format(obj))
            # Query for all record ids
            self.logger.info('  Querying for all {} objects'.format(obj))
            query_job = self.bulk.create_query_job(obj, contentType='CSV')
            batch = self.bulk.query(query_job, "select Id from {}".format(obj))
            while not self.bulk.is_batch_done(batch, query_job):
                time.sleep(10)
            self.bulk.close_job(query_job)
            delete_rows = []
            for result in self.bulk.get_all_results_for_query_batch(
                    batch, query_job):
                reader = unicodecsv.DictReader(result, encoding='utf-8')
                for row in reader:
                    delete_rows.append(row)

            if not delete_rows:
                self.logger.info(
                    '  No {} objects found, skipping delete'.format(obj))
                continue

            # Delete the records
            delete_job = self.bulk.create_delete_job(obj, contentType='CSV')
            self.logger.info('  Deleting {} {} records'.format(
                len(delete_rows), obj))
            batch_num = 1
            for batch in self._upload_batch(delete_job, delete_rows):
                self.logger.info('    Uploaded batch {}'.format(batch))
                while not self.bulk.is_batch_done(batch, delete_job):
                    self.logger.info(
                        '      Checking status of batch {0}'.format(batch_num))
                    time.sleep(10)
                self.logger.info('      Batch {} complete'.format(batch))
                batch_num += 1
            self.bulk.close_job(delete_job)
Example 25
    def _test_csv(self):
        doc = self.window.document
        csv = doc.document_path.with_suffix('.csv')

        # Check CSV contents
        with csv.open('rb') as infile:
            res = unicodecsv.DictReader(infile, encoding='utf-8')
            for index, item, row in zip(count(), doc.items, res):
                expected = item['fields']
                expected.update({
                    'ItemNumber': '{0}'.format(1 + index),
                    'Cropped_image_name': '{0:04}.jpg'.format(1 + index),
                })
                actual = {
                    k: v
                    for k, v in row.items()
                    if v and k not in BOUNDING_BOX_FIELD_NAMES
                }
                self.assertEqual(expected, actual)

        # Five rows were read, so the last zero-based index is 4
        self.assertEqual(index, 4)
Example 26
    def _run_query(self, soql, mapping):
        self.logger.info(
            'Creating bulk job for: {sf_object}'.format(**mapping))
        job = self.bulk.create_query_job(mapping['sf_object'],
                                         contentType='CSV')
        self.logger.info('Job id: {0}'.format(job))
        self.logger.info('Submitting query: {}'.format(soql))
        batch = self.bulk.query(job, soql)
        self.logger.info('Batch id: {0}'.format(batch))
        self.bulk.wait_for_batch(job, batch)
        self.logger.info('Batch {0} finished'.format(batch))
        self.bulk.close_job(job)
        self.logger.info('Job {0} closed'.format(job))

        field_map = {}
        for field in self._fields_for_mapping(mapping):
            field_map[field['sf']] = field['db']

        for result in self.bulk.get_all_results_for_query_batch(batch, job):
            reader = unicodecsv.DictReader(result, encoding='utf-8')
            for row in reader:
                self._import_row(row, mapping, field_map)

        self.session.commit()
Example 27
def readData(input_file, field_names, prefix=None):
    """
    Read in our data from a CSV file and create a dictionary of records, 
    where the key is a unique record ID and each value is a 
    [frozendict](http://code.activestate.com/recipes/414283-frozen-dictionaries/) 
    (hashable dictionary) of the row fields.

    **Currently, dedupe depends upon records' unique ids being integers
    with no integers skipped. The smallest valued unique id must be 0 or
    1. Expect this requirement will likely be relaxed in the future.**
    """

    data = {}

    reader = csv.DictReader(StringIO(input_file))
    for i, row in enumerate(reader):
        clean_row = [(k, preProcess(v)) for (k, v) in row.items()]
        if prefix:
            row_id = u"%s|%s" % (prefix, i)
        else:
            row_id = i
        data[row_id] = dedupe.core.frozendict(clean_row)

    return data
Example 28
    def load(self):
        headers = [
            'parish',
            'office',
            'district',
            'party',
            'candidate',
            'votes'
        ]
        self._common_kwargs = self._build_common_election_kwargs()
        self._common_kwargs['reporting_level'] = 'parish'
        # Store result instances for bulk loading
        results = []

        with self._file_handle as csvfile:
            reader = unicodecsv.DictReader(csvfile, fieldnames=headers, encoding='latin-1')
            for row in reader:
                if self._skip_row(row):
                    continue
                rr_kwargs = self._common_kwargs.copy()
                rr_kwargs.update(self._build_contest_kwargs(row))
                rr_kwargs.update(self._build_candidate_kwargs(row))
                results.append(RawResult(**rr_kwargs))
        RawResult.objects.insert(results)
Example 29
def test_convert(datadir):
    codebook = datadir.join('qds_input_fixture.csv').open()

    converted = cqo.convert(codebook)

    reader = csv.DictReader(converted, encoding='utf-8', delimiter=',')

    row1 = reader.next()
    row2 = reader.next()
    row3 = reader.next()

    assert row1['field'] == u'TODAY'
    assert row1['table'] == u'Test_Schema_Title'
    assert row1['type'] == u'string'

    assert row2['field'] == u'GENDER'
    assert row2['table'] == u'Test_Schema_Title'
    assert row2['type'] == u'choice'

    assert row3['field'] == u'BIRTHSEX'
    assert row3['table'] == u'Test_Schema_Title'
    assert row3['type'] == u'choice'

    converted.close()
Example 30
 def load(self):
     with self._file_handle as csvfile:
         results = []
         reader = unicodecsv.DictReader(csvfile)
         for row in reader:
             # Skip non-target offices
             if self._skip_row(row):
                 continue
             elif any(s in self.mapping['generated_filename']
                      for s in ['2008', '2010', '2011']):
                 if row['Type'] == 'County':
                     results.append(self._prep_county_result(row))
                 else:
                     continue
             elif '__precinct__' not in self.mapping['generated_filename']:
                 if row['CountyName'] == '':
                     continue
                 results.append(self._prep_county_result(row))
             elif any(county == row['CountyName'] for county in
                      ['Kanawha', 'Marshall', 'Nicholas', 'Cabell']):
                 results.append(self._prep_github_precinct_result(row))
             else:
                 results.append(self._prep_precinct_result(row))
         RawResult.objects.insert(results)