Example #1
 def __init__(self,refGene_fn) :
     refGene_f = open(refGene_fn)
     # check for header
     first_line = refGene_f.next()
     if not first_line.strip().startswith('#') :
         refGene_f.seek(0) # first line not header, reset the file pointer
     DictReader.__init__(self,refGene_f,delimiter='\t',fieldnames=RefGeneOutput.FIELD_NAMES)
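A minimal Python 3 sketch of the same header-probe idea (the file object's .next() method is gone in Python 3, so next() is used instead); FIELD_NAMES below is a stand-in for RefGeneOutput.FIELD_NAMES and the column names are made up:

from csv import DictReader

FIELD_NAMES = ['bin', 'name', 'chrom']  # placeholder columns, not the real refGene schema

def open_refgene(refgene_fn):
    fh = open(refgene_fn)
    first_line = next(fh)               # probe the first line for a '#' comment header
    if not first_line.strip().startswith('#'):
        fh.seek(0)                      # no header: rewind so DictReader sees every row
    return DictReader(fh, delimiter='\t', fieldnames=FIELD_NAMES)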
Example #2
def action(args):
    def newname(leaf, newname):
        leaf.name = newname
        return leaf

    tree = Phylo.parse(args.tree, args.tree_type).next()
    leafs = (leaf for leaf in tree.get_terminals())

    if args.info:
        info = DictReader(args.info, fieldnames = ['seqname','newname'])
        info = {i['seqname']:i['newname'] for i in info}

        # for newick trees :s will be replaced by |s
        if args.tree_type == 'newick':
            info = {s.replace(':', '|'):n for s,n in info.items()}

        leafs = (l for l in leafs if l.name in info)
        leafs = (newname(l, info[l.name]) for l in leafs)

    if args.remove_word:
        leafs = (newname(l, re.sub(args.remove_word, '', l.name)) for l in leafs)
        leafs = (newname(l, l.name.strip()) for l in leafs)

    leafs = (newname(l, args.add_prefix + l.name) for l in leafs)
    leafs = (newname(l, l.name + args.add_suffix) for l in leafs)

    # do this last
    if args.tree_type == 'newick':
        leafs = (newname(l, l.name.replace(' ', '_')) for l in leafs)

    # execute changes and write tree
    list(leafs)
    Phylo.write(tree, args.out, args.tree_type)
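The renaming steps above are lazy generator expressions, so no leaf is actually touched until list(leafs) drains the pipeline; a tiny stand-alone sketch of that behaviour:

def rename(item, new):
    item['name'] = new
    return item

leaves = [{'name': 'a'}, {'name': 'b'}]
pipeline = (rename(l, l['name'].upper()) for l in leaves)  # nothing has run yet
list(pipeline)                                             # draining the generator applies the renames
print(leaves)                                              # [{'name': 'A'}, {'name': 'B'}]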
Example #3
    def test_subset_with_shapefile_no_ugid(self):
        """Test a subset operation using a shapefile without a UGID attribute."""

        output_format = [constants.OUTPUT_FORMAT_NUMPY, constants.OUTPUT_FORMAT_CSV_SHAPEFILE]

        geom = self.get_shapefile_path_with_no_ugid()
        geom_select_uid = [8, 11]
        geom_uid = 'ID'
        rd = self.test_data.get_rd('cancm4_tas')

        for of in output_format:
            ops = OcgOperations(dataset=rd, geom=geom, geom_select_uid=geom_select_uid, geom_uid=geom_uid, snippet=True,
                                output_format=of)
            self.assertEqual(len(ops.geom), 2)
            ret = ops.execute()
            if of == constants.OUTPUT_FORMAT_NUMPY:
                for element in geom_select_uid:
                    self.assertIn(element, ret)
                self.assertEqual(ret.properties[8].dtype.names, ('STATE_FIPS', 'ID', 'STATE_NAME', 'STATE_ABBR'))
            else:
                with open(ret) as f:
                    reader = DictReader(f)
                    row = reader.next()
                    self.assertIn(geom_uid, row.keys())
                    self.assertNotIn(env.DEFAULT_GEOM_UID, row.keys())

                shp_path = os.path.split(ret)[0]
                shp_path = os.path.join(shp_path, 'shp', '{0}_gid.shp'.format(ops.prefix))
                with fiona.open(shp_path) as source:
                    record = source.next()
                    self.assertIn(geom_uid, record['properties'])
                    self.assertNotIn(env.DEFAULT_GEOM_UID, record['properties'])
Example #4
def load_data(uri, dateFormat):
    logging.info('loading data; uri: {0}'.format(uri))
    
    from urllib2 import urlopen
    from csv import DictReader
    
    reader = DictReader(urlopen(uri).readlines())
    
    encodedFieldNames = []
    for fieldname in reader.fieldnames:
        encodedFieldNames.append(fieldname.decode("utf-8-sig").encode("utf-8"))
    reader.fieldnames = encodedFieldNames
    
    data = []
    
    from time import strptime
    
    for row in reader:
        data.append({
            'date': strptime(row['Date'], dateFormat),
            'open': float(row['Open']),
            'close': float(row['Close']),
            'high': float(row['High']),
            'low': float(row['Low']),
            'volume': float(row['Volume'])
        })
    
    return data
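A hedged usage sketch for load_data (the URL and date format are placeholders; the CSV is assumed to have the Date/Open/Close/High/Low/Volume columns read above):

quotes = load_data('http://example.com/prices.csv', '%Y-%m-%d')  # hypothetical URL
print(quotes[0]['close'])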
Example #5
 def __init__(self, fid, commentchar='#', *args, **kwds):
     if issubclass(DictReader, object):
         super(DictReader, self).__init__(fid, *args, **kwds)
     else:
         DictReader.__init__(self, fid, *args, **kwds)
     self.commentchar = commentchar
     self.leadingfield = self.commentchar + 'label'
Example #6
 def __init__(self, f, fieldnames=None, restkey=None, restval=None,
              dialect="excel", *args, **kw):
     DictReader.__init__(self, f, fieldnames=fieldnames,
                         restkey=restkey, restval=restval,
                         dialect=dialect, *args, **kw)
     # Replace the reader with our unicode-enabled reader.
     self.reader = UnicodeReader(f, dialect=dialect, *args, **kw)
Example #7
def upload_resources(filename, skip=0, limit=None):
    """Upload  from a CSV file."""
    # Use sys.stdout.write so resources can be printed nicely and succinctly
    import sys

    date_converter = lambda s: datetime.strptime(s, '%Y-%m-%d')
    bool_converter = lambda s: s == "true"
    resource_schema = facility_schema['fields']
    
    convert_map = {
        'integer': int,
        'float': float,
        'datetime': date_converter,
        'boolean': bool_converter
    }

    convert = {}

    for k, v in resource_schema.items():
        field_type = v.get('type')
        if field_type in convert_map:
            convert[k] = convert_map[field_type]

    def print_flush(msg):
        sys.stdout.write(msg)
        sys.stdout.flush()

    facility_code = facility_schema['facility_code']
    print_every = 1000
    print_flush("Adding resources. Please be patient.")

    with open(filename) as f:
        reader = DictReader(f)
        for i in range(skip):
            reader.next()
        for i, d in enumerate(reader):
            actual_index = i + skip + 2
            do_print = actual_index % print_every == 0
            try:
                d = dict((k, convert.get(k, str)(v)) for k, v in d.items() if v)
                coords = [d.pop('longitude', None), d.pop('latitude', None)]
                if coords[0] and coords[1]:
                    d['location'] = {'type': 'Point', 'coordinates': coords}
                d['facility_code'] = facility_code
                if not check(add_document(facility_schema['endpoint'], d), 201, False):
                    raise Exception()
                if do_print:
                    print_flush(".")

            except Exception as e:
                print "Error adding resource", e
                pprint(d)
                exit()

            if limit and i >= limit:
                break
    # Create a 2dsphere index on the location field for geospatial queries
    app.data.driver.db['resources'].create_index([('location', '2dsphere')])
    print "Resources uploaded!"
Example #8
def upload_waterpoints(filename, skip=0, limit=None):
    """Upload waterpoints from a CSV file."""
    date_converter = lambda s: datetime.strptime(s, '%Y-%m-%d')
    bool_converter = lambda s: s == "true"

    status_map = {
        "non functional": "not functional",
        "functional needs repair": "needs repair"
    }

    status_converter = lambda s: status_map.get(s.lower(), s.lower())

    convert = {
        'gid': int,
        'object_id': int,
        'valid_from': date_converter,
        'valid_to': date_converter,
        'amount_tsh': float,
        'breakdown_year': int,
        'date_recorded': date_converter,
        'gps_height': float,
        'latitude': float,
        'longitude': float,
        'num_private': int,
        'region_code': int,
        'district_code': int,
        'population': int,
        'public_meeting': bool_converter,
        'construction_year': int,
        'status_group': status_converter
    }

    facility_code = "wpf001"

    with open(filename) as f:
        reader = DictReader(f)
        for i in range(skip):
            reader.next()
        for i, d in enumerate(reader):
            print "Adding line", i + skip + 2

            try:
                d = dict((k, convert.get(k, str)(v)) for k, v in d.items() if v)
                coords = [d.pop('longitude'), d.pop('latitude')]
                d['location'] = {'type': 'Point', 'coordinates': coords}
                d['facility_code'] = facility_code
                if not check(add_document('waterpoints', d)):
                    raise Exception()

            except Exception as e:
                print "Error adding waterpoint", e
                pprint(d)
                exit()

            if limit and i >= limit:
                break
    # Create a 2dsphere index on the location field for geospatial queries
    app.data.driver.db['facilities'].create_index([('location', '2dsphere')])
Example #9
 def __init__(self, f, fieldnames=None, restkey=None, restval=None,
              dialect="excel", encoding='utf-8', *args, **kw):
     DictReader.__init__(self, f, fieldnames=fieldnames,
                         restkey=restkey, restval=restval,
                         dialect=dialect, *args, **kw)
     if not encoding is None:
         f = Utf8Recoder(f, encoding=encoding)
     # Replace the reader with our unicode-enabled reader.
     self.reader = reader(f, dialect=dialect, *args, **kw)
Example #10
def extractThresholdValues(fname):
    # parse csv file and add threshold values as dict
    # this method might be called multiple times for one item

    # There are various formats:
    #   combined.modelEvaluation: Threshold Name, Testing.data, Cutoff,
    #                             Sensitivity, Specificity
    #   biomod2.modelEvaluation: Threshold Name, Testing.data, Cutoff.*,
    #                            Sensitivity.*, Specificity.*
    #   maxentResults.csv: Species,<various columns with interesting values>
    #                <threshold name><space><cumulative threshold,
    #                              logistic threshold,area,training omission>
    # FIXME: this is really ugly and csv format detection should be done
    #        differently
    thresholds = {}
    if fname.endswith("maxentResults.csv"):
        csvfile = open(fname, "r")
        dictreader = DictReader(csvfile)
        row = dictreader.next()
        # There is only one row in maxentResults
        namelist = (
            "Fixed cumulative value 1",
            "Fixed cumulative value 5",
            "Fixed cumulative value 10",
            "Minimum training presence",
            "10 percentile training presence",
            "10 percentile training presence",
            "Equal training sensitivity and specificity",
            "Maximum training sensitivity plus specificity",
            "Balance training omission, predicted area and threshold value",
            "Equate entropy of thresholded and original distributions",
        )
        for name in namelist:
            # We extract only 'cumulative threshold' values
            threshold = "{} cumulative threshold".format(name)
            thresholds[threshold] = Decimal(row[threshold])
    else:
        # assume it's one of our biomod/dismo results
        csvfile = open(fname, "r")
        dictreader = DictReader(csvfile)
        # search the field with Cutoff
        name = "Cutoff"
        for fieldname in dictreader.fieldnames:
            if fieldname.startswith("Cutoff."):
                name = fieldname
                break
        try:
            for row in dictreader:
                try:
                    thresholds[row[""]] = Decimal(row[name])
                except (TypeError, InvalidOperation) as e:
                    LOG.warn(
                        "Couldn't parse threshold value '%s' (%s) from" "file '%s': %s", name, row[name], fname, repr(e)
                    )
        except KeyError:
            LOG.warn("Couldn't extract Threshold '%s' from file '%s'", name, fname)
    return thresholds
Example #11
def locations(rack_locations_path=RACKS_LOCATION_CSV):
    with open(rack_locations_path, 'r') as file:
        csv_file = DictReader(file,
                              ["latitude", "longitude", "icon", "desc", "racks_count", "parking_places"])
        acc = []
        next(csv_file)  # Skip the header row
        for attributes in csv_file:
            acc.append(RacksLocation(attributes))

        return acc
Example #12
 def __init__(self, f, fieldnames=None, restkey=None, restval=None,
              dialect="excel",
              encoding=None,
              *args, **kwds):
     BaseDictReader.__init__(self, f=f, fieldnames=fieldnames,
                             restkey=restkey, restval=restval,
                             dialect=dialect,
                             *args, **kwds)
     from .csv import reader
     self.reader = reader(f, dialect=dialect,
                          encoding=encoding,
                          **kwds)
Example #13
  def __init__(self, csv):
    self.bag = Counter()
    reader = DictReader(open(csv, 'r'), fieldnames=[
      "TileFile", "Borders", "Quantity", "Features", "Notes"])
    reader.next()  # skip header, we've defined our own

    for tile_dict in reader:
      tile = Tile.from_csv(tile_dict)
      quantity = int(tile_dict["Quantity"].strip())
      self.bag[tile] = quantity
      if "B" in tile_dict["Features"]:
        self.first_tile = tile
Example #14
def number1():
    filename = '/home/apt9online/src/bslcks/jtest.csv'
    cong = DictReader(open(filename))

    while True:
        p = cong.next()
        print cong.line_num
        if p['Include on directory'] == 'Yes':
          if p['Family relation'] <> 'Duplicate':
            try:
                Person.objects.get(bslc_individual=p['Indiv #'])
                print "%s %s already exists in the DB" % (p['First name'],p['Last name'])
            except:
                record_person(p)
Example #15
def csvInput(file,options,dialect='excel'):
    header=options['header']
    from csv import DictReader
    with open(file,'r') as f:
        if not header:
            reader = DictReader(f,dialect=dialect)
        else:
            reader = DictReader(f,dialect=dialect,fieldnames=header.split(','))
        reader.fieldnames = map(options['alias'],reader.fieldnames)
        entries =[line for line in reader]
        map(lambda(dict):
                dict.update({"file":file,
                             "format":fileType(file)}),
            entries)
        return entries
Example #16
    def test_write_csv(self):
        """TestBase: Base::write_csv() creates a valid csv"""
        from csv import DictReader

        fname = "thermal.csv"
        trappy.Run().thermal.write_csv(fname)

        with open(fname) as fin:
            csv_reader = DictReader(fin)

            self.assertTrue("Time" in csv_reader.fieldnames)
            self.assertTrue("temp" in csv_reader.fieldnames)

            first_data = csv_reader.next()
            self.assertEquals(first_data["Time"], "0.0")
            self.assertEquals(first_data["temp"], "68786")
Example #17
 def __init__(self, csvfile, fields=None, silent=False, **kwargs):
   self.csvfile = csvfile
   self.rows_imported = 0
   self.errors = []
   self.silent = silent
   if fields:
     if isinstance(fields[0], (list, tuple)):
       kwargs['fieldnames'] = [field[0] for field in fields]
       self.field_types = dict(fields)
     else:
       kwargs['fieldnames'] = fields
       self.field_types = dict.fromkeys(fields, None)
     DictReader.__init__(self, csvfile, **kwargs)
   else:
     DictReader.__init__(self, csvfile, **kwargs)
     self.field_types = dict.fromkeys(self.fieldnames, None)
Example #18
def graceful_read_csv(filename):
	from csv import DictReader

	data = []
	try:
		f = open(filename, 'rb')
	except IOError as e:
		print( "ERROR:", e.strerror )
		exit()

	csvreader = DictReader(f)
	while True:
		try: row = csvreader.next()
		except: break
		data.append(row)

	return data
Example #19
 def __init__(self, filename, container = None, dialect = 'simplecsv'):
     self._container = None
     if isinstance(container, ObjectContainer):
         self._container = container
         self._reader = DictReader(filename, fieldnames = None, restkey = "restkey", restval = "restval", dialect = dialect)
     elif isinstance(container, TupleContainer) or isinstance(container, ListContainer):
         self._container = container
         self._reader = csv.reader(filename, dialect = dialect)
     else:
         raise Exception("Given container is not valid")
Example #20
File: utils.py Project: squioc/gtfs
class CSVUnicodeReader(object):
    def __init__(self, stream):
        self.reader = DictReader(UTF8Encoder(stream))

    def __iter__(self):
        return self

    def next(self):
        entry = self.reader.next()
        return dict([(unicode(k, "utf-8"), unicode(v, "utf-8")) for (k,v) in entry.items()])
Example #21
def test_append_field_err(form_config, form_data, log_path):
    """ Checks that error logs are correctly written and appended

    Submits three forms, the second two have different fields to the first
    and should be added to the same log file as each other, and be identical
    """
    formmail.log_formdata(form_data, log_path)
    del form_data['email']

    # submit two forms with fields that dont match the config
    # this should append the second form to the error log file
    with pytest.raises(Exception):
        formmail.log_formdata(form_data, log_path)
    with pytest.raises(Exception):
        formmail.log_formdata(form_data, log_path)

    with open(log_path + '_error') as error_log:
        reader = DictReader(error_log)
        assert reader.next() == form_data
        assert reader.next() == form_data
Example #22
    def clean_csv(self):
        """ ueberprueft, ob eine gueltige CSV-Datei hochgeladen wurde """

        # first test: is the content type valid
        csv = self.cleaned_data['csv']
        if csv.content_type != 'text/csv':
            self._errors['csv'] = self.error_class(['Only CSV files are allowed as input!'])
            return csv
    
        # second test: does the file have the right number of columns?
        reader = DictReader(csv)
        try:
            entry = reader.next()
            if len(entry) != 12:
                msg = 'Invalid CSV file format (wrong number of columns)!'
                self._errors['csv'] = self.error_class([msg])
        except StopIteration:
            msg = 'Invalid CSV file format (no orders present)!'
            self._errors['csv'] = self.error_class([msg])
        orders = [entry] + [row for row in reader]
        return orders
Example #23
def upload_waterpoints(filename, skip=0, limit=None):
    """Upload waterpoints from a gzipped CSV file."""
    convert = {
        'date_recorded': lambda s: datetime.strptime(s, '%m/%d/%Y'),
        'population': int,
        'construction_year': lambda s: datetime.strptime(s, '%Y'),
        'breakdown_year': lambda s: datetime.strptime(s, '%Y'),
        'amount_tsh': float,
        'gps_height': float,
        'latitude': float,
        'longitude': float,
    }
    with gzip.open(filename) as f:
        reader = DictReader(f)
        for i in range(skip):
            reader.next()
        for i, d in enumerate(reader):
            d = dict((k, convert.get(k, str)(v)) for k, v in d.items() if v)
            d['facility_code'] = 'wpf001'
            check(add_document('waterpoints', d))
            if limit and i >= limit:
                break
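The example above is Python 2 style; under Python 3, gzip.open defaults to binary mode while DictReader needs text, so a port would open the archive in text mode. A minimal sketch, assuming the same gzipped CSV layout (the path is a placeholder):

import gzip
from csv import DictReader

filename = 'waterpoints.csv.gz'  # hypothetical path
with gzip.open(filename, 'rt', newline='') as f:  # 'rt' yields str lines instead of bytes
    for row in DictReader(f):
        pass  # the same per-row conversion as above would go here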
Example #24
def test_success(field, expected, log_path, response_for, form_data, sm_mock):
    key, value = field
    form_data[key] = value
    assert response_for(form_data, log=False) == expected
    assert sm_mock.call_count == 1

    params = sm_mock.call_args[0][1]['fields']
    assert set(params.keys()) == set(form_data.keys())
    for key, value in form_data.items():
        assert params[key] == value.decode('utf8')

    assert response_for(form_data, log=True) == expected
    assert sm_mock.call_count == 2

    assert response_for(form_data, log=True) == expected
    assert sm_mock.call_count == 3

    with open(log_path) as log_file:
        reader = DictReader(log_file)
        row = reader.next()
        # rows should not be equal because the time field
        # is added by the logging function.
        assert row != reader.next()
Example #25
 def next(self):
   row = DictReader.next(self)
   try:
     processed_row = dict(
         (key, convert(value, self.field_types[key]))
         for key, value in row.iteritems()
     )
   except ValueError as e:
     self.errors.append((e, row))
     if not self.silent:
       raise e
   else:
     self.rows_imported += 1
     return processed_row
Example #26
class StructuredReader(object):
    def __init__(self, filename, container = None, dialect = 'simplecsv'):
        self._container = None
        if isinstance(container, ObjectContainer):
            self._container = container
            self._reader = DictReader(filename, fieldnames = None, restkey = "restkey", restval = "restval", dialect = dialect)
        elif isinstance(container, TupleContainer) or isinstance(container, ListContainer):
            self._container = container
            self._reader = csv.reader(filename, dialect = dialect)
        else:
            raise Exception("Given container is not valid")

    def next(self):
        # do not treat the header row
        if self._reader.line_num == 0:
            self._reader.next()

        row = self._reader.next()
        return self._container.fetch(row)


    def __iter__(self):
        return self
Example #27
def dictcsv(csvname, fieldnames = None, arrays = False):
    """Reading csv files into a dictionary.

    Arguments:
    csvname: string filename
    
    Keyword Arguments:
    fieldnames: list of csv column names.  If None, the first row of the file
                being read will be used.
    arrays: Whether or not to return csv contents as a dict of numpy arrays

    Returns:
    dictionary of columns keyed by fieldnames; values are lists, or numpy
    arrays if arrays is True
    """

    fileobj = open(csvname, 'rU')
    DR = DictReader(fileobj, fieldnames = fieldnames)
    
    fields = DR.fieldnames
    l = DR.next()
    dicty = {}
    for f in fields:
        try:
            dicty[f] = [float(l[f])]
        except (TypeError, ValueError):
            dicty[f] = [l[f]]
    for row in DR:
        for f in fields:
            try:
                dicty[f].append(float(row[f]))
            except (TypeError, ValueError):
                dicty[f].append(row[f])
    if arrays:
        for key in dicty:
            dicty[key] = np.array(dicty[key])
            
    return dicty
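A hedged usage sketch for dictcsv (the file name is hypothetical):

cols = dictcsv('measurements.csv')               # dict of lists keyed by the header row
print(list(cols.keys()))
arrs = dictcsv('measurements.csv', arrays=True)  # same data as numpy arrays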
Example #28
    def __init__(self, csvfile, casts, fieldnames=None, restkey=None, 
                 restval=None, dialect='excel', *args, **kwds):
        """Arguments:
           - f: An iterable object such as a file. Passed on to
             csv.DictReader
           - casts: A dict mapping from attribute names to functions to apply
             to these names, e.g., {'id':int, 'salary':float}
           - fieldnames: Passed on to csv.DictReader
           - restkey: Passed on to csv.DictReader
           - restval: Passed on to csv.DictReader
           - dialect: Passed on to csv.DictReader
           - *args: Passed on to csv.DictReader
           - **kwds: Passed on to csv.DictReader
        """
        DictReader.__init__(self, csvfile, fieldnames=fieldnames, 
                            restkey=restkey, restval=restval, dialect=dialect, 
                            *args, **kwds)

        if not type(casts) == dict:
            raise TypeError("The casts argument must be a dict")
        for v in casts.values():
            if not callable(v):
                raise TypeError("The values in casts must be callable")
        self._casts = casts
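The snippet above only shows __init__; a self-contained Python 3 sketch of the same casting idea, with the class name CastingDictReader and the usage columns invented for illustration:

from csv import DictReader

class CastingDictReader(DictReader):
    """DictReader that applies per-column cast functions, e.g. {'id': int, 'salary': float}."""

    def __init__(self, csvfile, casts, **kwds):
        super().__init__(csvfile, **kwds)
        if not isinstance(casts, dict):
            raise TypeError("The casts argument must be a dict")
        for v in casts.values():
            if not callable(v):
                raise TypeError("The values in casts must be callable")
        self._casts = casts

    def __next__(self):
        row = super().__next__()
        # columns without a registered cast are passed through unchanged
        return {k: self._casts.get(k, lambda x: x)(v) for k, v in row.items()}

# usage: CastingDictReader(open('people.csv', newline=''), casts={'id': int, 'salary': float})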
Example #29
 def __init__(self, fnm_in='input.csv', fnm_out='output.csv', restkey=None, restval=None,
               dialect_in="excel", dialect_out="excel"):
     self.f_in = open(fnm_in)
     self.csv_dict_reader = DictReader(self.f_in, restkey=restkey, restval=restval, dialect=dialect_in)
     field_names = self.csv_dict_reader.fieldnames
     if len(field_names) <> len(self.defined_input_field_names):
         raise ValueError,\
               ("incorrect number of columns in the file %s, it should have %d columns" %
                (fnm_in, len(self.defined_input_field_names)))
     if [1 for x in zip(field_names,self.defined_input_field_names) if x[0] != x[1]]:
         raise ValueError,\
           ("incorrect names of columns in the file %s, they should be %s" %
           (fnm_in,'"{0}"'.format('","'.join(x for x in self.defined_input_field_names))))
     self.f_out = open(fnm_out, 'w')
     self.csv_dict_writer = DictWriter(self.f_out, self.defined_output_field_names, dialect=dialect_out)
Example #30
class BaseCSVHandler(object):
    defined_input_field_names = ['date','customer','money']
    defined_output_field_names = ['date','customer','money']
    result = []

    def __init__(self, fnm_in='input.csv', fnm_out='output.csv', restkey=None, restval=None,
                  dialect_in="excel", dialect_out="excel"):
        self.f_in = open(fnm_in)
        self.csv_dict_reader = DictReader(self.f_in, restkey=restkey, restval=restval, dialect=dialect_in)
        field_names = self.csv_dict_reader.fieldnames
        if len(field_names) <> len(self.defined_input_field_names):
            raise ValueError,\
                  ("incorrect number of columns in the file %s, it should have %d columns" %
                   (fnm_in, len(self.defined_input_field_names)))
        if [1 for x in zip(field_names,self.defined_input_field_names) if x[0] != x[1]]:
            raise ValueError,\
              ("incorrect names of columns in the file %s, they should be %s" %
              (fnm_in,'"{0}"'.format('","'.join(x for x in self.defined_input_field_names))))
        self.f_out = open(fnm_out, 'w')
        self.csv_dict_writer = DictWriter(self.f_out, self.defined_output_field_names, dialect=dialect_out)

    def __iter__(self):
        return self

    def one_string_handler(self,s):
        if s: self.result.append (s)

    def next(self):
        return self.csv_dict_reader.next()

    def calc_result(self):
        pass

    def write_result(self):
        self.csv_dict_writer.writeheader()
        self.csv_dict_writer.writerows(self.result)

    def close_all_files(self):
        self.f_in.close()
        self.f_out.close()
        self.csv_dict_writer = None
        self.csv_dict_reader = None

    def process_all(self):
        for i in self: self.one_string_handler(i)
        self.calc_result()
        self.write_result()
        self.close_all_files()
Example #31
                        type=argparse.FileType('w'),
                        default=sys.stdout,
                        help="output file")
    parser.add_argument('--subsample',
                        type=float,
                        default=1.0,
                        help='subsample this fraction of total')
    args = parser.parse_args()
    trainfile = prepfile(args.trainfile, 'r')
    if args.testfile is not None:
        testfile = prepfile(args.testfile, 'r')
    else:
        testfile = None
    outfile = prepfile(args.outfile, 'w')
    # Create feature extractor (you may want to modify this)
    trainn = DictReader(trainfile, delimiter='\t')

    fe = FeatureExtractor()
    # Read in training data
    train = DictReader(trainfile, delimiter='\t')

    pss = PorterStemmer()

    # Split off dev section
    dev_train = []
    dev_test = []
    full_train = []
    p = 0
    cc = 0
    s1, s2, s3, s4, s5 = 0, 0, 0, 0, 0
    b1, b2, b4, b3, b5 = 0, 0, 0, 0, 0
Example #32
def combine_split_chained_results(output_prefixes, final_prefix, ref_gff,
                                  ref_group, ref_name, ref_fq, addon_gff,
                                  addon_group, addon_name, addon_fq):
    """
    Each <output_prefix> will have .gff, .group.txt, .mega_info.txt.
    There should be NO overlap between the split files, so clean merge should be possible!

    1. read the .gff files, record the group and mega (id-map) info
    2. sort the total records so can properly put on a unified superPBID
    3. write out the unified result
    4. delete the split files
    """

    # sanity check files are all there
    split_files = []  # tuple of (gff, group, mega)
    for ref_name, o in output_prefixes:
        gff_file = 'tmp_' + o + '.gff'
        mega_file = 'tmp_' + o + '.mega_info.txt'
        group_file = 'tmp_' + o + '.group.txt'
        if not os.path.exists(gff_file) or not os.path.exists(
                mega_file) or not os.path.exists(group_file):
            print(
                "Expects to see {0},{1},{2} but one or more files are missing! Abort!"
                .format(gff_file, mega_file, group_file),
                file=sys.stderr)
            sys.exit(-1)
        split_files.append((ref_name, o, gff_file, group_file, mega_file))

    use_fq = False
    if ref_fq is not None and addon_fq is not None:
        use_fq = True
        ref_fq_dict = dict((r.id.split('|')[0], r)
                           for r in SeqIO.parse(open(ref_fq), 'fastq'))
        addon_fq_dict = dict((r.id.split('|')[0], r)
                             for r in SeqIO.parse(open(addon_fq), 'fastq'))

    mega_info = {}  # ref id -> list of matching query_id, or empty list
    split_unmatched = set()

    for (ref_name, split_name, gff_file, group_file, mega_file) in split_files:
        for r in DictReader(open(mega_file), delimiter='\t'):
            if r[ref_name] != 'NA':
                if r[ref_name] not in mega_info:
                    mega_info[r[ref_name]] = []
                if r[split_name] != 'NA':
                    mega_info[r[ref_name]].append(r[split_name])
            else:  # ref is NA, non-ref is not NA
                split_unmatched.add(r[split_name])

    # make a rec list of matches of (ref_id, addon_id, representative record, combined group info) where rec_ref or ref_addon could be None, but not both
    rec_list = []
    d_ref = dict((r.seqid, r) for r in GFF.collapseGFFReader(ref_gff))
    d_addon = dict((r.seqid, r) for r in GFF.collapseGFFReader(addon_gff))

    ref_group_info = sp.MegaPBTree.read_group(ref_group, None)
    addon_group_info = sp.MegaPBTree.read_group(addon_group, None)

    for ref_id, matches in mega_info.items():
        if len(matches) == 0:
            rec_list.append(
                sp.MatchRecord(ref_id=ref_id,
                               addon_id='NA',
                               rec=d_ref[ref_id],
                               members=ref_group_info[ref_id],
                               seqrec=ref_fq_dict[ref_id] if use_fq else None))
        else:
            for addon_id in matches:
                r1 = d_ref[ref_id]
                r2 = d_addon[addon_id]
                if (r1.end - r1.start) > (r2.end - r2.start):
                    rec_list.append(
                        sp.MatchRecord(
                            ref_id=ref_id,
                            addon_id=addon_id,
                            rec=r1,
                            members=ref_group_info[ref_id] +
                            addon_group_info[addon_id],
                            seqrec=ref_fq_dict[ref_id] if use_fq else None))
                else:
                    rec_list.append(
                        sp.MatchRecord(ref_id=ref_id,
                                       addon_id=addon_id,
                                       rec=r2,
                                       members=ref_group_info[ref_id] +
                                       addon_group_info[addon_id],
                                       seqrec=addon_fq_dict[addon_id]
                                       if use_fq else None))
    for addon_id in split_unmatched:
        rec_list.append(
            sp.MatchRecord(ref_id='NA',
                           addon_id=addon_id,
                           rec=d_addon[addon_id],
                           members=addon_group_info[addon_id],
                           seqrec=addon_fq_dict[addon_id] if use_fq else None))

    sp.write_reclist_to_gff_n_info(rec_list, final_prefix, ref_name,
                                   addon_name, use_fq)
    for (ref_name, split_name, gff_file, group_file, mega_file) in split_files:
        os.remove(gff_file)
        os.remove(group_file)
        os.remove(mega_file)
Example #33
from glob import glob
import re
from csv import DictReader
from collections import defaultdict


def inner_dict_init():
    return defaultdict(float)


protocols = defaultdict(inner_dict_init)

for infile in glob('TLS1_bank/*'):
    with open(infile, 'r') as of:
        reader = DictReader(of)
        for row in reader:
            for p in row:
                if not p: continue
                item = row[p]
                cipher, value = item.split()
                cipher = cipher.translate(None, "{'}:")
                value = value.translate(None, "{}, ''")
                protocols[p][cipher] += float(value)

# print protocols
res = {}
for p in protocols:
    a = sorted(protocols[p], key=lambda x: protocols[p][x], reverse=True)
    res[p] = [(i, protocols[p][i]) for i in a][:5]
    res[p].append(('other', sum([protocols[p][i] for i in a][5:])))
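For completeness, a small sketch of dumping the per-protocol top-5 summary built above:

for proto, top_ciphers in res.items():
    print(proto)
    for cipher, value in top_ciphers:
        print('  {0}: {1}'.format(cipher, value))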
Example #34
    def read_raven_selection_table(cls, tbl_path):
        '''
        Given the path to a Raven selection table
        .csv file, return a list of dicts. Each dict
        contains the information in one selection table
        row, plus two additional entries: time_interval,
        and freq_interval; these are added for convenience:
        
        Selection           row number <int>
        View                not used
        Channel             not used
        Begin Time (s)      begin of vocalization in fractional seconds <float>
        End Time (s)        end of vocalization in fractional seconds <float>
        Low Freq (Hz)       lowest frequency within the lassoed vocalization <float>
        High Freq (Hz)      highest frequency within the lassoed vocalization <float>
        species             four-letter species name <str>
        type                {song, call, call-1, call-trill} <str>
        number              not used
        mix                 comma separated list of other audible species [<str>]
        
        time_interval       Interval instance from start and end times
        freq_interval       Interval instance from start and end frequencies
        
        Values are converted to appropriate types as above.
        Output is suitable for SnippetSelectionTableMapper.match_snippets()
        
        The list will be sorted by ascending 'Begin Time (s)'
        value.

        :param tbl_path: full path to Raven selection table
        :type tbl_path: str
        :return: list of dicts, each dict reflecting the content
            of one selection table row
        :rtype [str : Any]
        '''
        with open(tbl_path, 'r') as sel_tbl_fd:
            reader = DictReader(sel_tbl_fd, delimiter='\t')
            sel_dict_list = [row_dict for row_dict in reader]

        # Coerce types:
        for sel_dict in sel_dict_list:
            sel_dict['Selection'] = str(sel_dict['Selection'])
            sel_dict['Begin Time (s)'] = float(sel_dict['Begin Time (s)'])
            sel_dict['End Time (s)'] = float(sel_dict['End Time (s)'])
            sel_dict['Low Freq (Hz)'] = float(sel_dict['Low Freq (Hz)'])
            sel_dict['High Freq (Hz)'] = float(sel_dict['High Freq (Hz)'])
            # Turn the comma-separated list of
            # overlapping vocalizations into
            # a (possibly empty) list of strings:
            sel_dict['mix'] = [] if len(
                sel_dict['mix']) == 0 else sel_dict['mix'].split(',')
            # Clean out spurious white space:
            sel_dict['mix'] = [
                mix_list_entry.strip() for mix_list_entry in sel_dict['mix']
            ]
            # Remove empty mixes:
            #****
            sel_dict['time_interval'] = Interval(sel_dict['Begin Time (s)'],
                                                 sel_dict['End Time (s)'])
            sel_dict['freq_interval'] = Interval(sel_dict['Low Freq (Hz)'],
                                                 sel_dict['High Freq (Hz)'])

        # Make sure the list is sorted by
        # ascending start time:
        sel_dict_list_sorted = natsorted(
            sel_dict_list, key=lambda row_dict: row_dict['Begin Time (s)'])
        return sel_dict_list_sorted
Example #35
def chain_samples_multithread(dirs,
                              names,
                              group_filename,
                              gff_filename,
                              count_filename,
                              field_to_use='count_fl',
                              fuzzy_junction=0,
                              allow_5merge=False,
                              max_3_diff=100,
                              fastq_filename=None,
                              cpus=4):
    for d in dirs.values():
        sample_sanity_check(os.path.join(d, group_filename),\
                            os.path.join(d, gff_filename),\
                            os.path.join(d, count_filename),\
                            os.path.join(d, fastq_filename) if fastq_filename is not None else None)

    count_header, count_info = read_count_info(count_filename, dirs,
                                               field_to_use)

    # some names may already start with "tmp_" which means they are intermediate results that have already been chained
    # find the first non "tmp_" and start from there
    if names[0].startswith('tmp_'):
        chain = []
        for start_i, name in enumerate(names):
            if name.startswith('tmp_'):
                chain.append(name[4:])
            else:
                break
        # start_i, name now points at the first "non-tmp" sample
        # we want to go to the last tmp_ sample and read it
        name = names[start_i -
                     1][4:]  # this is the last tmp_ sample, let's read it
        first_add = False
    else:  # everything is new, start fresh
        name = names[0]
        chain = [name]
        start_i = 1
        first_add = True

    for addon_name in names[start_i:]:
        assert not addon_name.startswith('tmp_')
        ref_name = chain[-1]
        ref_d = dirs[ref_name]
        if first_add:
            ref_gff = os.path.join(ref_d, gff_filename)
            ref_group = os.path.join(ref_d, group_filename)
            ref_fq = os.path.join(
                ref_d, fastq_filename) if fastq_filename is not None else None
        else:
            ref_name = 'tmp_' + ref_name
            ref_gff = ref_name + '.gff'
            ref_group = ref_name + '.group.txt'
            ref_fq = ref_name + '.rep.fq' if fastq_filename is not None else None
        addon_d = dirs[addon_name]
        addon_gff = os.path.join(addon_d, gff_filename)
        addon_group = os.path.join(addon_d, group_filename)
        addon_fq = os.path.join(
            addon_d, fastq_filename) if fastq_filename is not None else None
        split_outs, split_ins = chain_split_file(ref_gff=ref_gff,
                                                 ref_group=ref_group,
                                                 ref_name=ref_name,
                                                 addon_gff=addon_gff,
                                                 addon_group=addon_group,
                                                 addon_name=addon_name,
                                                 fuzzy_junction=fuzzy_junction,
                                                 allow_5merge=allow_5merge,
                                                 max_3_diff=max_3_diff,
                                                 n_chunks=cpus)

        combine_split_chained_results(split_outs,
                                      final_prefix='tmp_' + addon_name,
                                      ref_gff=ref_gff,
                                      ref_group=ref_group,
                                      ref_name=ref_name,
                                      ref_fq=ref_fq,
                                      addon_gff=addon_gff,
                                      addon_group=addon_group,
                                      addon_name=addon_name,
                                      addon_fq=addon_fq)

        chain.append(addon_name)
        for in_gff_split, in_group_split in split_ins:
            os.remove(in_gff_split)  # remove the split gff
            os.remove(in_group_split)

        first_add = False

    # now recursively chain back by looking at mega_info.txt!!!
    d = {}  # ex: (tmp_sample1, PB.1.1) --> mega info dict
    for c in chain[1:]:
        for r in DictReader(open('tmp_' + c + '.mega_info.txt'),
                            delimiter='\t'):
            d['tmp_' + c, r['superPBID']] = r

    f1 = open('all_samples.chained_ids.txt', 'w')
    writer1 = DictWriter(f1, fieldnames=['superPBID'] + chain, delimiter='\t')
    writer1.writeheader()
    f2 = open('all_samples.chained_count.txt', 'w')
    writer2 = DictWriter(f2, fieldnames=['superPBID'] + chain, delimiter='\t')
    writer2.writeheader()

    reader = DictReader(open('tmp_' + chain[-1] + '.mega_info.txt'),
                        delimiter='\t')
    for r in reader:
        saw_NA = False
        r0 = r
        answer = defaultdict(lambda: 'NA')  # ex: 1009 --> PB.1.1
        answer2 = defaultdict(lambda: 'NA')  # ex: 1009 --> count
        answer[chain[-1]] = r[chain[-1]]
        if r[chain[-1]] != 'NA':
            answer2[chain[-1]] = count_info[chain[-1], answer[chain[-1]]]
        for c in chain[::-1][
                1:
                -1]:  # the first sample does not have tmp_, because it's not a chain
            if r['tmp_' + c] == 'NA':
                saw_NA = True
                break
            else:
                r2 = d['tmp_' + c, r['tmp_' + c]]
                answer[c] = r2[c]
                if answer[c] != 'NA':
                    answer2[c] = count_info[c, answer[c]]
                r = r2
        if not saw_NA:
            answer[chain[0]] = r[chain[0]]
            if answer[chain[0]] != 'NA':
                answer2[chain[0]] = count_info[chain[0], answer[chain[0]]]

        rec1 = {'superPBID': r0['superPBID']}
        rec2 = {'superPBID': r0['superPBID']}
        for c in chain:
            rec1[c] = answer[c]
            rec2[c] = str(answer2[c])
        writer1.writerow(rec1)
        writer2.writerow(rec2)
    f1.close()
    f2.close()

    shutil.copyfile('tmp_' + chain[-1] + '.gff', 'all_samples.chained.gff')
    if fastq_filename is not None:
        shutil.copyfile('tmp_' + chain[-1] + '.rep.fq',
                        'all_samples.chained.rep.fq')

    print("Chained output written to:", file=sys.stdout)
    print("all_samples.chained.gff", file=sys.stdout)
    print(f1.name, file=sys.stdout)
    print(f2.name, file=sys.stdout)
    if fastq_filename is not None:
        print("all_samples.chained.rep.fq", file=sys.stdout)
Example #36
def readCSV(filename):
    with open(filename) as inFile:
        drObject = DictReader(inFile)
        return list(drObject)
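A one-line usage sketch for readCSV (the file name is hypothetical):

rows = readCSV('movies.csv')
print(len(rows), rows[0] if rows else None)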
Example #37
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('performance_files', type=Path, nargs='+')
    parser.add_argument('--output-dir', type=Path)
    args = parser.parse_args()

    performance = []
    for f in args.performance_files:
        with open(f) as fp:
            performance.extend(list(DictReader(fp)))
    cols = set()
    for p in performance:
        cols.update(p.keys())

    data_dicts = defaultdict(list)
    for p in performance:
        for c in cols:
            v = p.get(c, None)
            data_dicts[c].append(v)

    df = pd.DataFrame(data_dicts)

    nwp_models = df['nwp_model'].unique()
    mae = df['mean_absolute_error']
    df['mean_absolute_error'] = pd.to_numeric(df['mean_absolute_error'],
                                              errors='coerce')
    df = df.dropna(axis=0, how='any', subset=['mean_absolute_error'])
    mae_min = df['mean_absolute_error'].min()
    mae_max = df['mean_absolute_error'].max()
    models = df['model'].unique()
    fig, axes = plt.subplots(len(nwp_models),
                             len(models),
                             sharex='col',
                             squeeze=False,
                             sharey='all')
    for i, nwp_model in enumerate(sorted(nwp_models)):
        for j, model in enumerate(sorted(models)):
            ax = axes[i, j]
            model_df = df.loc[np.logical_and(df['nwp_model'] == nwp_model,
                                             df['model'] == model)]
            try:
                sns.boxplot(data=model_df,
                            x='fold_id',
                            y='mean_absolute_error',
                            ax=ax)
            except ValueError:
                continue
            ax.set_title(nwp_model)
            if i == len(nwp_models) - 1:
                ax.set_xlabel('Test fold id')
            else:
                ax.set_xlabel('')
            if i == 0:
                ax.text(0.5,
                        1.3,
                        model,
                        horizontalalignment='center',
                        verticalalignment='center',
                        transform=ax.transAxes,
                        fontdict=dict(size=18))
            if j == 0:
                ax.set_ylabel('Mean absolute error')
            else:
                ax.set_ylabel('')

    if args.output_dir is not None:

        save_path = args.output_dir / f'nwp_compare.png'
        plt.savefig(save_path)
    plt.show()
Example #38
def read_content(filename):
    with open(filename, newline='') as fp:
        reader = DictReader(fp)
        return list(reader)
Example #39
  'ces': 'Czech', 'deu': 'German', 'fra': 'French', 'fre': 'French',
  'esn': 'Spanish', 'fin': 'Finnish', 'rus': 'Russian', 'hin': 'Hindi',
  'eng': 'English', 'ron': 'Romanian', 'tur': 'Turkish',
  'Portguese': 'Portuguese'
}

if __name__ == "__main__":
    args = PARSER.parse_args()
    
    if not args.inter_annotator_agreement and \
      not args.intra_annotator_agreement:
        print("Defaulting to --inter mode.")
        args.inter_annotator_agreement = True
    
    results_data = defaultdict(lambda: defaultdict(list))
    for i, row in enumerate(DictReader(args.results_file)):
        src_lang = row.get('srclang')
        if src_lang in list(LANGUAGE_CODE_TO_NAME.keys()):
            src_lang = LANGUAGE_CODE_TO_NAME[src_lang]
        
        trg_lang = row.get('trglang')
        if trg_lang in list(LANGUAGE_CODE_TO_NAME.keys()):
            trg_lang = LANGUAGE_CODE_TO_NAME[trg_lang]
        
        language_pair = '{0}-{1}'.format(src_lang, trg_lang)
        segment_id = int(row.get('srcIndex'))
        judge_id = row.get('judgeId')
        if not judge_id:
            judge_id = row.get('judgeID')
        
        # Filter out results where a user decided to "skip" ranking.
        a = sorted(lst, key=lambda x: x[1])

        return a[1:16]


dct = {
    'AS': 'Asia',
    'EU': 'Europe',
    'AF': 'Africa',
    'OC': 'Oceania',
    'NA': 'North America',
    'AN': 'Antarctica',
    'SA': 'South America'
}
with open('countries.csv') as f:
    reader = list(DictReader(f))
    for i in reader:
        with open('%s.html' % i['short_name'], 'w') as myFile:
            myFile.write('<html>\n')
            myFile.write('<head>\n')
            myFile.write('\t<title>%s</title>\n' % i['name'])
            myFile.write('</head>\n')

            a = i['short_name']
            myFile.write(
                '<img src="http://www.crwflags.com/fotw/images/{}/{}.gif">\n'.
                format(a[0].lower(), a.lower()))

            myFile.write('<h1>%s</h1>\n' % i['name'])
            myFile.write('<dl>\n')
            myFile.write('\t<dt>Capital</dt>\n')
Example #41
from csv import DictReader

data_rdr = DictReader(open('../data-wrangling/data/unicef/mn.csv', 'rt'))
header_rdr = DictReader(
    open('../data-wrangling/data/unicef/mn_headers.csv', 'rt'))

data_rows = [d for d in data_rdr]
header_rows = [h for h in header_rdr]
new_rows = []

for data_dict in data_rows:
    new_row = {}
    for dkey, dval in data_dict.items():
        for header_dict in header_rows:
            if dkey in header_dict.values():
                new_row[header_dict.get('Label')] = dval
    new_rows.append(new_row)

print(new_rows[0])
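The nested loop above rescans every header row for every data key; if the header CSV has a column holding the short variable name (called 'Name' here purely as an assumption), a one-time lookup dict does the same remapping in linear time:

# build {short_name: human_label} once; 'Name' is an assumed column in mn_headers.csv
label_for = {h['Name']: h.get('Label') for h in header_rows}

new_rows = [
    {label_for[dkey]: dval for dkey, dval in data_dict.items() if dkey in label_for}
    for data_dict in data_rows
]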
Example #42

import requests, yaml, json
from flask import Flask, request, redirect, url_for, render_template
from csv import DictReader

cred = yaml.safe_load(open("credentials.yml"))
WT_ADMIN_TOKEN = cred['WT_ADMIN_TOKEN']
MERAKI_KEY = cred['MERAKI_KEY']
MERAKI_NETWORK_ID = cred['MERAKI_NETWORK_ID']

# Room Data, from csv file
MerakiCamera_to_WebexRoomKitMini = []
with open('MerakiCameras_to_WebexRoomKitMini_Pairing.csv', 'r') as read_obj:
    csv_dict_reader = DictReader(read_obj)
    for row in csv_dict_reader:
        row['Room_Name'] = row.pop('Room_Name')
        MerakiCamera_to_WebexRoomKitMini.append(row)
# note the use case is developed with only one patient room available, requires iterations if multiple rooms are listed in MerakiCameras_to_WebexRoomKitMini_Pairing.csv
MERAKI_SN = MerakiCamera_to_WebexRoomKitMini[0]['Meraki_SN']
ROOM_NAME = MerakiCamera_to_WebexRoomKitMini[0]['Room_Name']
ROOMKIT_ID = MerakiCamera_to_WebexRoomKitMini[0]['Webex_RoomKitMini_ID']
SIP_URL = MerakiCamera_to_WebexRoomKitMini[0]['Webex_RoomKitMini_SIP']

app = Flask(__name__)


@app.route('/')
def pop_up():
    snapshot_url = "https://api.meraki.com/api/v0/networks/{1}/cameras/{0}/snapshot".format(
Example #43
from csv import DictReader
import pickle

data = list(DictReader(open("diabetes.csv", "rt")))

data1 = open("diabetes.pkl", "wb")
pickle.dump(data, data1)

data1.close()

data1 = open("diabetes.pkl", "rb")
pickle.load(data1)

for i in data:
    print(i)
Example #44
#!/usr/bin/python

import model
from model import db
from io import open
from csv import DictReader

db.drop_all()
db.create_all()

with open('data/movies.csv', 'r', encoding='utf-8-sig') as movies_file:
    reader = DictReader(movies_file)
    for row in reader:
        new_movie = model.Movie(name=row['name'], year=row['year'])

        actors = row['actors'].split(';')
        for actor in actors:
            print(actor)
            existing_actor = model.Actor.query.filter_by(name=actor).first()
            if (existing_actor):
                existing_actor.movies.append(new_movie)
                new_movie.actors.append(existing_actor)
            else:
                new_actor = model.Actor(name=actor)
                new_actor.movies.append(new_movie)
                new_movie.actors.append(new_actor)
                db.session.add(new_actor)

        db.session.add(new_movie)

with open('data/songs.csv', 'r', encoding='utf-8-sig') as songs_file:
Example #45
 def _detect_header(self):
     with open(self._file) as csv_file:
         reader = DictReader(csv_file)
         self.header = reader.fieldnames
Example #46
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import os
import re
import nltk
import numpy as np
from sklearn import feature_extraction
from tqdm import tqdm
from nltk.tokenize import word_tokenize, sent_tokenize
from sklearn.feature_extraction.text import CountVectorizer

# In[3]:

#load train data
f_train_bodies = open(data_path + '/' + 'x_train.csv', 'r', encoding='utf-8')
X_train_csv = DictReader(f_train_bodies)
X_train_data = list(X_train_csv)

col_name = ['Headline', 'Body', 'Stance']
X_train_headline = [x[col_name[0]] for x in X_train_data]
X_train_body = [x[col_name[1]] for x in X_train_data]
Y_train = [y[col_name[2]] for y in X_train_data]
y_train = Y_train + Y_train

# In[4]:

# Read the text files of fnc data
X_train_df = pd.read_csv(data_path + '/' + 'x_train.csv')
X_test_df = pd.read_csv(data_path + '/' + 'x_test.csv')

# In[5]:
Example #47
def prepare_distinct(path, out, embedder):
    print path
    c = 0
    start = datetime.now()
    with open(out, 'w') as outfile:
        columns = [
            'w2v_sim_mean',
            'w2v_sim_max',
            'w2v_sim_min',
            'w2v_sim_std',
            'w2v_dist_mean',
            'w2v_dist_max',
            'w2v_dist_min',
            'w2v_dist_std',
        ]
        columns = ','.join(columns)
        outfile.write(columns + '\n')
        for t, row in enumerate(DictReader(open(path), delimiter=',')):
            if c % 100000 == 0:
                print 'finished', c
            q1 = remove_punctuation(str(row['question1']).lower())
            q2 = remove_punctuation(str(row['question2']).lower())
            # print q1,q2
            q1, q2 = distinct_terms(q1, q2)
            # print q1,"_______",q2
            a2 = [x for x in q1.split(' ') if x in embedder.vocab]
            b2 = [x for x in q2.split(' ') if x in embedder.vocab]
            # print a2,b2

            sims = []
            dists = []
            if len(a2) == 0 or len(b2) == 0:
                sims = [0.0]
                dists = [0.0]
            else:
                for i in range(len(a2)):
                    for j in range(len(b2)):
                        try:
                            worda = a2[i]
                            wordb = b2[j]
                            if worda == "" or wordb == "":
                                continue
                            sim = embedder.n_similarity(worda, wordb)
                            vector_diff = embedder[worda] - embedder[wordb]
                            dist = np.sqrt(np.sum(vector_diff**2))

                            sims.append(sim)
                            dists.append(dist)
                        except Exception, e:
                            # print e
                            continue
            if len(sims) == 0 or len(dists) == 0:
                sims = [0.0]
                dists = [0.0]

            w2v_sim_mean = np.mean(sims)
            w2v_sim_max = np.max(sims)
            w2v_sim_min = np.min(sims)
            w2v_sim_std = np.std(sims)

            w2v_dist_mean = np.mean(dists)
            w2v_dist_max = np.max(dists)
            w2v_dist_min = np.min(dists)
            w2v_dist_std = np.std(dists)
            features = (
                w2v_sim_mean,
                w2v_sim_max,
                w2v_sim_min,
                w2v_sim_std,
                w2v_dist_mean,
                w2v_dist_max,
                w2v_dist_min,
                w2v_dist_std,
            )
            outfile.write('%s,%s,%s,%s,%s,%s,%s,%s\n' % features)
            c += 1
        end = datetime.now()
Example #48
def load_label(path, label):
    result = []
    for row in DictReader(open(path)):
        if int(row['Class']) == label:
            result.append((row['Id']))
    return result
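A hedged usage sketch for load_label (the path is a placeholder; the CSV is assumed to have the 'Class' and 'Id' columns read above):

positive_ids = load_label('train.csv', 1)
print(len(positive_ids))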
Example #49
            print('day{0} completed!'.format(this))

#        print(line)
        train.write(','.join('{0}'.format(line[i]) for i in TRAIN)+'\n')
        test.write(','.join('{0}'.format(line[i]) for i in TEST)+'\n')
        validation.write('{0},{1}\n'.format(line['Id'],line['Label']))
        #t+=1


#day_split()
day_split2()

last=0
dataset=open('day1.vw')
validation=open('day1_val.csv')
for e, row in enumerate(DictReader(open(input)) ):
        categorical_features = []
        for k,v in row.items():
            if k not in ['Label','Id']:
                if len(str(v)) > 0 :
                    categorical_features.append('{0}.{1}'.format(k,v)) # joining with '-' here made the test fail, oddly; ignoring it

        if(type=='-train'):
            if row['Label']=='1':
                label = 1
            else:
                label = -1
        else:
            if validation.readline().strip().split(',')[1] =='1':
                label = 1
            else:
Example #50
# writer # DictWriter

from csv import writer

with open('file2.csv', 'w', newline='') as f:
    csv_writer = writer(f)
    # csv_writer.writerow(['name','country'])
    # csv_writer.writerow(['nateq','india'])
    # csv_writer.writerow(['ahmed','india'])
    csv_writer.writerows([['name', 'country'], ['nateq', 'india'],
                          ['ahmed', 'india']])

from csv import DictReader

with open('file2.csv', 'r') as rf:
    csv_reader = DictReader(rf)
    for row in csv_reader:
        print(row)
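The comment at the top of this example mentions DictWriter but only the plain writer is shown; a minimal DictWriter counterpart that produces the same file:

from csv import DictWriter

with open('file2.csv', 'w', newline='') as f:
    csv_writer = DictWriter(f, fieldnames=['name', 'country'])
    csv_writer.writeheader()
    csv_writer.writerows([{'name': 'nateq', 'country': 'india'},
                          {'name': 'ahmed', 'country': 'india'}])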
Example #51
from csv import DictReader

train = 'data/train_df_app_smooth.csv'  # path to training file
test = 'data/test_df_app_smooth.csv'
#train = 'data/train_df_site_smooth.csv'               # path to training file
#test = 'data/test_df_site_smooth.csv'

# -- train data -- #
# list(test_df.columns.values)

start = datetime.now()
with open('data/train_df_app_smooth_ex_id.csv', 'w') as outfile:
    outfile.write(
        'id,click,hour,C1,banner_pos,app_domain,app_category,device_ip,device_model,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21\n'
    )
    for t, row in enumerate(DictReader(open(train))):
        # turn hour really into hour, it was originally YYMMDDHH

        ID = row['id']
        click = row['click']
        hour = row['hour']
        C1 = row['C1']
        banner_pos = row['banner_pos']
        app_domain = row['app_domain']
        app_category = row['app_category']
        #device_id = row['device_id']
        device_ip = row['device_ip']
        device_model = row['device_model']
        device_type = row['device_type']
        device_conn_type = row['device_conn_type']
        C14 = row['C14']
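The comment in the loop says the hour field should be reduced from YYMMDDHH to the hour of day, but the fragment is cut off before that happens. One common way to do it, shown here as an assumption rather than the original code, is to keep only the last two characters of the raw value:

# Hedged sketch: 'hour' arrives as YYMMDDHH (e.g. '14102123'); keep only the HH part.
def extract_hour(raw_hour):
    return raw_hour[6:]  # '14102123' -> '23'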
Example #52
0
def filter_by_count(input_prefix, output_prefix, min_count):

    group_filename = input_prefix + '.group.txt'
    count_filename = input_prefix + '.abundance.txt'
    gff_filename = input_prefix + '.gff'
    rep_filename = input_prefix + '.rep.fq'

    # read group
    group_max_count_fl = {}
    group_max_count_p = {}
    f = open(group_filename)
    for line in f:
        #ex: PB.1.1  i0HQ_54b0ca|c58773/f30p16/700
        pbid, members = line.strip().split('\t')
        group_max_count_fl[pbid] = 0
        group_max_count_p[pbid] = 0
        members = members.split(',')
        for m in members:
            i = m.find('|')
            if i > 0:
                tmp = m.split('|')[1].split('/')[1]  #ex: tmp = f30p16
            else:
                tmp = m.split('/')[1]
            fl_count, p_count = tmp.split('p')
            fl_count = int(fl_count[1:])
            p_count = int(p_count)
            group_max_count_fl[pbid] = max(group_max_count_fl[pbid], fl_count)
            group_max_count_p[pbid] = max(group_max_count_p[pbid], p_count)
    f.close()

    # read abundance first
    f = open(count_filename)
    count_header = ''
    while True:
        cur_pos = f.tell()
        line = f.readline()
        if not line.startswith('#'):
            f.seek(cur_pos)
            break
        else:
            count_header += line
    d = dict((r['pbid'], r) for r in DictReader(f, delimiter='\t'))
    for k, v in d.items():
        print(k, v)
    f.close()

    # group_max_count_p NOT used for now
    good = [x for x in d
            if int(d[x]['count_fl']) >= min_count
            and group_max_count_fl[x] >= min_count
            and group_max_count_p[x] >= 0]

    # write output GFF
    f = open(output_prefix + '.gff', 'w')
    for r in GFF.collapseGFFReader(gff_filename):
        if r.seqid in good: GFF.write_collapseGFF_format(f, r)
    f.close()

    # write output rep.fq
    f = open(output_prefix + '.rep.fq', 'w')
    for r in SeqIO.parse(open(rep_filename), 'fastq'):
        if r.name.split('|')[0] in good:
            SeqIO.write(r, f, 'fastq')
    f.close()

    # write output to .abundance.txt
    f = open(output_prefix + '.abundance.txt', 'w')
    f.write(count_header)
    writer = DictWriter(f, fieldnames=['pbid','count_fl','count_nfl','count_nfl_amb','norm_fl','norm_nfl','norm_nfl_amb'], \
                        delimiter='\t', lineterminator='\n')
    writer.writeheader()
    for k in good:
        r = d[k]
        writer.writerow(r)
    f.close()

    print("Output written to:", output_prefix + '.gff', file=sys.stderr)
    print("Output written to:", output_prefix + '.rep.fq', file=sys.stderr)
    print("Output written to:", output_prefix + '.abundance.txt', file=sys.stderr)
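A call to this function might look like the following; the prefixes and threshold are hypothetical values for illustration only.

# Hypothetical invocation: reads myprefix.group.txt / .abundance.txt / .gff / .rep.fq
# and writes filtered.gff, filtered.rep.fq and filtered.abundance.txt.
filter_by_count('myprefix', 'filtered', min_count=2)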
Example #53
0
def read_quotes(filename):
    with open(filename, 'r', encoding='utf-16') as file:
        csv_reader = DictReader(file)
        return list(csv_reader)
Example #54
0
    def test_export(
        self,
        es_with_collector,
        request_sortby,
        orm_ordering,
        requests_mock,
        accepts_dit_email_marketing,
    ):
        """Test export of contact search results."""
        ArchivedContactFactory()
        ContactWithOwnAddressFactory()
        ContactFactory()

        # These are to test date of and team of latest interaction a bit more thoroughly
        CompanyInteractionFactory.create_batch(2)
        CompanyInteractionFactory(contacts=ContactFactory.create_batch(2))
        interaction_with_multiple_teams = CompanyInteractionFactory()
        InteractionDITParticipantFactory.create_batch(
            2,
            interaction=interaction_with_multiple_teams,
        )

        es_with_collector.flush_and_refresh()

        data = {}
        if request_sortby:
            data['sortby'] = request_sortby

        url = reverse('api-v3:search:contact-export')

        with freeze_time('2018-01-01 11:12:13'):
            response = self.api_client.post(url, data=data)

        assert response.status_code == status.HTTP_200_OK
        assert parse_header(response.get('Content-Type')) == ('text/csv', {
            'charset':
            'utf-8'
        })
        assert parse_header(response.get('Content-Disposition')) == (
            'attachment',
            {
                'filename': 'Data Hub - Contacts - 2018-01-01-11-12-13.csv'
            },
        )

        sorted_contacts = Contact.objects.annotate(
            computed_address_country_name=Coalesce(
                'address_country__name',
                'company__address_country__name',
            ), ).order_by(
                orm_ordering,
                'pk',
            )

        matcher = requests_mock.post(
            f'{settings.CONSENT_SERVICE_BASE_URL}'
            f'{CONSENT_SERVICE_PERSON_PATH_LOOKUP}',
            text=generate_hawk_response({
                'results': [{
                    'email':
                    contact.email,
                    'consents': [
                        CONSENT_SERVICE_EMAIL_CONSENT_TYPE,
                    ] if accepts_dit_email_marketing else [],
                } for contact in sorted_contacts],
            }),
            status_code=status.HTTP_200_OK,
        )

        reader = DictReader(StringIO(response.getvalue().decode('utf-8-sig')))
        assert reader.fieldnames == list(
            SearchContactExportAPIView.field_titles.values())

        expected_row_data = [{
            'Name':
            contact.name,
            'Job title':
            contact.job_title,
            'Date created':
            contact.created_on,
            'Archived':
            contact.archived,
            'Link':
            f'{settings.DATAHUB_FRONTEND_URL_PREFIXES["contact"]}/{contact.pk}',
            'Company':
            get_attr_or_none(contact, 'company.name'),
            'Company sector':
            get_attr_or_none(contact, 'company.sector.name'),
            'Company link':
            f'{settings.DATAHUB_FRONTEND_URL_PREFIXES["company"]}/{contact.company.pk}',
            'Company UK region':
            get_attr_or_none(contact, 'company.uk_region.name'),
            'Country':
            contact.company.address_country.name if
            contact.address_same_as_company else contact.address_country.name,
            'Postcode':
            contact.company.address_postcode
            if contact.address_same_as_company else contact.address_postcode,
            'Phone number':
            ' '.join(
                (contact.telephone_countrycode, contact.telephone_number)),
            'Email address':
            contact.email,
            'Accepts DIT email marketing':
            accepts_dit_email_marketing,
            'Date of latest interaction':
            max(contact.interactions.all(), key=attrgetter('date')).date
            if contact.interactions.all() else None,
            'Teams of latest interaction':
            _format_interaction_team_names(
                max(contact.interactions.all(), key=attrgetter('date')), )
            if contact.interactions.exists() else None,
            'Created by team':
            get_attr_or_none(contact, 'created_by.dit_team.name'),
        } for contact in sorted_contacts]

        actual_row_data = [dict(row) for row in reader]
        assert actual_row_data == format_csv_data(expected_row_data)
        assert matcher.call_count == 1
        assert matcher.last_request.json() == {
            'emails': [contact.email for contact in sorted_contacts],
        }
Example #55
0
def gather_experiment_data(experiment, lead_time_interval=None):
    experiment_data = dict()

    metadata_path = experiment / 'metadata.json'
    if metadata_path.exists():
        with open(metadata_path) as fp:
            metadata = json.load(fp)
        site_dataset_path = Path(metadata['experiment_config']['site_dataset_path'])
        site_id = get_site_id(site_dataset_path)
        nwp_model = get_nwp_model_from_path(site_dataset_path)
        experiment_data['site_id'] = site_id
        experiment_data['nwp_model'] = nwp_model.identifier
        try:
            experiment_data['model'] = metadata['model_metadata']['model']
            model_kwargs = metadata['model_metadata']['kwargs']
            for kwarg, value in model_kwargs.items():
                experiment_data[kwarg] = str(value)
        except KeyError:
            pass
        try:
            model_config = metadata['hp_settings']['model_config']
            experiment_data['model'] = model_config['model']
            model_kwargs = model_config['model_kwargs']
            for kwarg, value in model_kwargs.items():
                experiment_data[f'model_kwarg_{kwarg}'] = str(value)
        except KeyError:
            pass
    best_model_path = experiment / 'best_model'
    if best_model_path.exists():
        try:
            best_model = load_model(best_model_path)
            if isinstance(best_model, windpower.mltrain.train.BaseModel):
                model_metadata = best_model.get_metadata()
                for k, v in model_metadata.items():
                    if k == 'args':
                        for i, arg in enumerate(v):
                            experiment_data[f'args_{i}'] = arg
                    elif k == 'kwargs':
                        for kwarg_name, kwarg in v.items():
                            experiment_data[f'kwarg_{kwarg_name}'] = kwarg
                    else:
                        experiment_data[k] = v
            try:
                best_iteration = best_model.best_iteration_
                experiment_data['best_iteration'] = best_iteration
            except AttributeError:
                pass
        except ValueError:
            pass

    # best_performance.csv holds the best performance on the validation set; if test_predictions.npz exists, use the test predictions instead
    test_predictions_path = experiment / 'test_predictions.npz'
    if test_predictions_path.exists():
        print("Using test predictions to derive performance data")
        predictions = np.load(test_predictions_path)

        x = predictions['x']
        with open(experiment / 'artifacts' / 'settings.pkl', 'rb') as fp:
            settings = pickle.load(fp)
        model_variables_path = experiment / 'test_variable_definitions.json'
        with open(model_variables_path) as fp:
            model_variables = json.load(fp)

        lead_time_column = model_variables['lead_time'][0]
        time_of_day_column = model_variables['time_of_day'][0]
        if time_of_day_column >= x.shape[1]:
            print("Warning: time of day column index is out of range, falling back to lead_time_column + 1")
            time_of_day_column = lead_time_column + 1

        y = predictions['y']
        y_hat = predictions['y_hat']
        mae = np.abs(y - y_hat)

        production_offset = settings.dataset_config.production_offset
        time_of_day = x[:, time_of_day_column]
        lead_times = x[:, lead_time_column] + production_offset
        for lead_time in np.unique(lead_times.astype(int)):
            experiment_data[f'mae_{lead_time:02}'] = mae[lead_times == lead_time].mean()

        experiment_data['mae'] = mae.mean()
    else:
        best_performance_path = experiment / 'best_performance.csv'
        if best_performance_path.exists():
            with open(best_performance_path) as in_fp:
                best_performance = next(
                    iter(DictReader(in_fp)))  # take the first row, it's the only one
                for k, v in best_performance.items():
                    experiment_data[k] = v
        else:
            raise FileNotFoundError("No performance data found")

    fold_reference_times_path = experiment.parent / 'fold_reference_times.npz'
    if fold_reference_times_path.exists():
        fold_reference_times = np.load(fold_reference_times_path)
        training_reference_times = fold_reference_times['train']
        test_reference_times = fold_reference_times['test']
        experiment_data['n_train_forecasts'] = len(training_reference_times)
        experiment_data['n_test_forecasts'] = len(test_reference_times)

    return experiment_data
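Since the function returns one flat dict per experiment directory, a typical way to aggregate several experiments (an assumed usage, not part of the original) is to collect the dicts into a pandas DataFrame:

# Hedged usage sketch: 'experiments_root' is a hypothetical directory of experiment folders.
from pathlib import Path
import pandas as pd

rows = []
for experiment in Path('experiments_root').iterdir():
    if experiment.is_dir():
        try:
            rows.append(gather_experiment_data(experiment))
        except FileNotFoundError:
            pass  # skip experiments for which no performance data was found
df = pd.DataFrame(rows)
print(df.head())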
Example #56
0
    def __init__(self, env_params, sim_params, network, simulator='traci'):
        """See parent class."""
        for p in OPEN_ENV_PARAMS.keys():
            if p not in env_params.additional_params:
                raise KeyError('Env parameter "{}" not supplied'.format(p))

        assert not (env_params.additional_params["warmup_path"] is not None
                    and env_params.additional_params["inflows"] is not None), \
            "Cannot assign a value to both \"warmup_paths\" and \"inflows\""

        # this is stored to be reused during the reset procedure
        self._network_cls = network.__class__
        self._network_name = deepcopy(network.orig_name)
        self._network_net_params = deepcopy(network.net_params)
        self._network_initial_config = deepcopy(network.initial_config)
        self._network_traffic_lights = deepcopy(network.traffic_lights)
        self._network_vehicles = deepcopy(network.vehicles)

        super(AVOpenEnv, self).__init__(
            env_params=env_params,
            sim_params=sim_params,
            network=network,
            simulator=simulator,
        )

        # Get the paths to all the initial state xml files
        warmup_path = env_params.additional_params["warmup_path"]
        if warmup_path is not None:
            self.warmup_paths = [
                f for f in os.listdir(warmup_path) if f.endswith(".xml")
            ]
            self.warmup_description = defaultdict(list)
            for record in DictReader(
                    open(os.path.join(warmup_path, 'description.csv'))):
                for key, val in record.items():  # or iteritems in Python 2
                    self.warmup_description[key].append(float(val))
        else:
            self.warmup_paths = None
            self.warmup_description = None

        # maximum number of controlled vehicles
        self.num_rl = env_params.additional_params["num_rl"]

        # queue of rl vehicles waiting to be controlled
        self.rl_queue = collections.deque()

        # names of the rl vehicles controlled at any step
        self.rl_veh = []

        # names of the rl vehicles past the control range
        self.removed_veh = []

        # used for visualization: the vehicles behind and after RL vehicles
        # (ie the observed vehicles) will have a different color
        self.leader = []
        self.follower = []

        # control range, updated to be entire network if not specified
        self._control_range = \
            self.env_params.additional_params["control_range"] or \
            [0, self.k.network.length()]

        # dynamics controller for uncontrolled RL vehicles (mimics humans)
        controller = self.k.vehicle.type_parameters["human"][
            "acceleration_controller"]
        self._rl_controller = controller[0](
            veh_id="rl",
            car_following_params=self.k.vehicle.type_parameters["human"][
                "car_following_params"],
            **controller[1]
        )

        if isinstance(network, I210SubNetwork):
            # the name of the final edge, whose speed limit may be updated
            self._final_edge = "119257908#3"
            # maximum number of lanes to add vehicles across
            self._num_lanes = 5
        else:
            # the name of the final edge, whose speed limit may be updated
            self._final_edge = "highway_end"
            # maximum number of lanes to add vehicles across
            self._num_lanes = 1
Example #57
0
def chain_samples(dirs,
                  names,
                  group_filename,
                  gff_filename,
                  count_filename,
                  field_to_use='count_fl',
                  fuzzy_junction=0,
                  allow_5merge=False,
                  max_3_diff=100,
                  fastq_filename=None):
    for d in dirs.values():
        sample_sanity_check(os.path.join(d, group_filename),\
                            os.path.join(d, gff_filename),\
                            os.path.join(d, count_filename),\
                            os.path.join(d, fastq_filename) if fastq_filename is not None else None)

    count_header, count_info = read_count_info(count_filename, dirs,
                                               field_to_use)

    # some names may already start with "tmp_" which means they are intermediate results that have already been chained
    # find the first non "tmp_" and start from there
    if names[0].startswith('tmp_'):
        chain = []
        for start_i, name in enumerate(names):
            if name.startswith('tmp_'):
                chain.append(name[4:])
            else:
                break
        # start_i, name now points at the first "non-tmp" sample
        # we want to go to the last tmp_ sample and read it
        name = names[start_i - 1][4:]  # this is the last tmp_ sample; read it
        o = sp.MegaPBTree('tmp_'+name+'.gff', 'tmp_'+name+'.group.txt', self_prefix='tmp_'+name, \
                        internal_fuzzy_max_dist=fuzzy_junction, \
                        allow_5merge=allow_5merge, \
                        max_3_diff=max_3_diff, \
                        fastq_filename='tmp_'+name+'.rep.fq' if fastq_filename is not None else None)
        #chain.append(name) # no need, already done above
    else:  # everything is new, start fresh
        name = names[0]
        d = dirs[name]
        chain = [name]
        o = sp.MegaPBTree(os.path.join(d, gff_filename), os.path.join(d, group_filename), \
                        self_prefix=name, internal_fuzzy_max_dist=fuzzy_junction, \
                        allow_5merge=allow_5merge, \
                        max_3_diff=max_3_diff, \
                        fastq_filename=os.path.join(d, fastq_filename) if fastq_filename is not None else None)
        start_i = 1

    for name in names[start_i:]:
        assert not name.startswith('tmp_')
        d = dirs[name]
        o.add_sample(os.path.join(d, gff_filename), os.path.join(d, group_filename), \
                     sample_prefix=name, output_prefix='tmp_'+name, \
                     fastq_filename=os.path.join(d, fastq_filename) if fastq_filename is not None else None)
        o = sp.MegaPBTree('tmp_'+name+'.gff', 'tmp_'+name+'.group.txt', self_prefix='tmp_'+name, \
                          internal_fuzzy_max_dist=fuzzy_junction, \
                          allow_5merge=allow_5merge, \
                          max_3_diff=max_3_diff, \
                          fastq_filename='tmp_'+name+'.rep.fq' if fastq_filename is not None else None)
        chain.append(name)

    # now recursively chain back by looking at mega_info.txt!!!
    d = {}  # ex: (tmp_1009, PB.1.1) --> mega info dict
    for c in chain[1:]:
        for r in DictReader(open('tmp_' + c + '.mega_info.txt'),
                            delimiter='\t'):
            d['tmp_' + c, r['superPBID']] = r

    f1 = open('all_samples.chained_ids.txt', 'w')
    writer1 = DictWriter(f1, fieldnames=['superPBID'] + chain, delimiter='\t')
    writer1.writeheader()
    f2 = open('all_samples.chained_count.txt', 'w')
    writer2 = DictWriter(f2, fieldnames=['superPBID'] + chain, delimiter='\t')
    writer2.writeheader()

    reader = DictReader(open('tmp_' + chain[-1] + '.mega_info.txt'),
                        delimiter='\t')
    for r in reader:
        saw_NA = False
        r0 = r
        answer = defaultdict(lambda: 'NA')  # ex: 1009 --> PB.1.1
        answer2 = defaultdict(lambda: 'NA')  # ex: 1009 --> count
        answer[chain[-1]] = r[chain[-1]]
        if r[chain[-1]] != 'NA':
            answer2[chain[-1]] = count_info[chain[-1], answer[chain[-1]]]
        for c in chain[::-1][1:-1]:  # the first sample has no tmp_ prefix because it was never chained
            if r['tmp_' + c] == 'NA':
                saw_NA = True
                break
            else:
                r2 = d['tmp_' + c, r['tmp_' + c]]
                answer[c] = r2[c]
                if answer[c] != 'NA':
                    answer2[c] = count_info[c, answer[c]]
                r = r2
        if not saw_NA:
            answer[chain[0]] = r[chain[0]]
            if answer[chain[0]] != 'NA':
                answer2[chain[0]] = count_info[chain[0], answer[chain[0]]]

        rec1 = {'superPBID': r0['superPBID']}
        rec2 = {'superPBID': r0['superPBID']}
        for c in chain:
            rec1[c] = answer[c]
            rec2[c] = str(answer2[c])
        writer1.writerow(rec1)
        writer2.writerow(rec2)
    f1.close()
    f2.close()

    shutil.copyfile('tmp_' + chain[-1] + '.gff', 'all_samples.chained.gff')
    if fastq_filename is not None:
        shutil.copyfile('tmp_' + chain[-1] + '.rep.fq',
                        'all_samples.chained.rep.fq')

    print("Chained output written to:", file=sys.stdout)
    print("all_samples.chained.gff", file=sys.stdout)
    print(f1.name, file=sys.stdout)
    print(f2.name, file=sys.stdout)
    if fastq_filename is not None:
        print("all_samples.chained.rep.fq", file=sys.stdout)
Example #58
0
from csv import DictWriter, DictReader

with open('test_csv', 'w', newline='') as f:
    csv_writer = DictWriter(f, fieldnames=['name', 'contact'])
    csv_writer.writerows([{
        'name': 'omkar',
        'contact': '123'
    }, {
        'name': 'surve',
        'contact': '456'
    }])

with open('test_csv', 'r') as t:
    csv_reader = DictReader(t, fieldnames=['name', 'contact'])
    for k in csv_reader:
        print(k['contact'])
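Because the writer above never calls writeheader(), the reader has to be given the field names explicitly. A variation (not part of the original example) is to write the header once and let DictReader pick the names up from the first row:

from csv import DictWriter, DictReader

with open('test_csv', 'w', newline='') as f:
    csv_writer = DictWriter(f, fieldnames=['name', 'contact'])
    csv_writer.writeheader()  # header row lets DictReader infer the field names
    csv_writer.writerows([{'name': 'omkar', 'contact': '123'},
                          {'name': 'surve', 'contact': '456'}])

with open('test_csv', 'r', newline='') as t:
    for k in DictReader(t):
        print(k['contact'])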
Example #59
0
plt.show()  # This needs to be commented out of user code
'''
import sys

def report(name, shortd, longd):
    d = {'Name': name, 'Short': shortd, 'Long': longd}
    print(str(d))

#Mock data goes first

from csv import DictReader # helps with handling csv formatted data
from urllib2 import urlopen # helps with pulling data off the web
url = 'https://docs.google.com/spreadsheets/d/1_artlzgoj6pDBCBfdt9-Jmc9RT9yLsZ0vTnk3zJmt_E/pub?gid=1291197392&single=true&output=csv'
response = urlopen(url)
loan_table = [row for row in DictReader(response)]  # a mapping function using identity

xloan_table = loan_table  # in case user screws with loan_table

int_scores = [int(row['Credit_History']) for row in xloan_table if row['Credit_History'] != '']

# image dirs

target_url1 = '/home/smccumsey/waggle-classroom/waggle/media/course_1/module_9/image_3/challenge3.png'
target_url2 = '/home/smccumsey/waggle-classroom/waggle/media/tmp/attempt_6_3.png'

import matplotlib.pyplot as plt

plt.close()
plt.cla()
plt.clf()
Example #60
0
            if len(existing_password) < 8:
                print("Password too short")
                continue

            elif len(existing_password) > 35:
                print("Password too long")
                continue

            elif not existing_password.isalnum():
                print("Password must be alphanumerical")
                continue
            # check if password/account exist in the system

            else:
                with open("user_data.csv", "r") as read_obj:
                    csv_reader = DictReader(read_obj)
                    for row in csv_reader:
                        if row["username"] == existing_user_id and row[
                                "password"] == existing_password:
                            print("Valid password")
                            print("Welcome back %s, it's been a while" %
                                  existing_user_id)
                            break
                        else:
                            print(
                                "Password or username is incorrect try again")
                break
        break

    # if the user does not have an account, they should create one by giving their name, email, username and password. The user must receive a greeting.
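The closing comment describes an account-creation path that is not shown. A minimal sketch of what it could look like, assuming user_data.csv has 'name', 'email', 'username' and 'password' columns (the exact layout is not given in the fragment), is:

# Hedged sketch: append a new account to user_data.csv and greet the user.
# The column layout is an assumption based on the comment above.
from csv import DictWriter

def create_account(name, email, username, password):
    with open("user_data.csv", "a", newline="") as write_obj:
        writer = DictWriter(write_obj, fieldnames=["name", "email", "username", "password"])
        writer.writerow({"name": name, "email": email,
                         "username": username, "password": password})
    print("Welcome, %s! Your account has been created." % name)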