def guess_dialect(self, sample):
    sniffer = Sniffer()
    try:
        dialect = sniffer.sniff(sample)
        has_header = sniffer.has_header(sample)
    except Error:
        # Delimiter detection failed – maybe there is an empty column:
        # "89.187.1.81,06-05-2016,,CZ,botnet drone"
        has_header = False  # let's just guess the value
        s = sample.split("\n")[1]  # skip the header (it surely has no empty column)
        delimiter = ""
        for dl in (",", ";", "|"):  # suppose a doubled sign is the delimiter
            if s.find(dl + dl) > -1:
                delimiter = dl
                break
        if not delimiter:  # try to find anything that resembles a delimiter
            for dl in (",", ";", "|"):
                if s.find(dl) > -1:
                    delimiter = dl
                    break
        dialect = csv.unix_dialect
        dialect.delimiter = delimiter
    if not dialect.escapechar:
        dialect.escapechar = '\\'
    # dialect.quoting = 3
    dialect.doublequote = True
    return dialect, has_header
def create_reader(self, csvfile):
    """
    Summary:
        Validates a csv file, returns a DictReader object.

    Description:
        Takes one argument: "csvfile" (should be a csv file)
    """
    # Determine the dialect of the csv file for processing
    file_dialect = Sniffer().sniff(csvfile.read(1024))
    # Reset the read/write pointer within the file
    csvfile.seek(0)
    # Check that the imported csv file has a header row,
    # which will be used for later parsing.
    if not Sniffer().has_header(csvfile.read(1024)):
        print('Imported csv file lacks header row')
        exit()
    # Reset the read/write pointer within the file
    csvfile.seek(0)
    # Create a DictReader object with the csvfile provided and the
    # dialect object to define the parameters of the reader instance.
    reader = DictReader(csvfile, dialect=file_dialect)
    # Return the DictReader object
    return reader
def guess_dialect(self, sample):
    sniffer = Sniffer()
    try:
        dialect = sniffer.sniff(sample)
        has_header = sniffer.has_header(sample)
    except Error:
        # Delimiter detection failed – maybe there is an empty column:
        # "89.187.1.81,06-05-2016,,CZ,botnet drone"
        if sample.strip() == "":
            print("The file seems empty")
            quit()
        has_header = False  # let's just guess the value
        try:
            s = sample.split("\n")[1]  # skip the header (it surely has no empty column)
        except IndexError:  # there is a single line in the file
            s = sample.split("\n")[0]
        delimiter = ""
        for dl in (",", ";", "|"):  # suppose a doubled sign is the delimiter
            if s.find(dl + dl) > -1:
                delimiter = dl
                break
        if not delimiter:  # try to find anything that resembles a delimiter
            for dl in (",", ";", "|"):
                if s.find(dl) > -1:
                    delimiter = dl
                    break
        dialect = csv.unix_dialect
        dialect.delimiter = delimiter
    if not dialect.escapechar:
        dialect.escapechar = '\\'
    # dialect.quoting = 3
    dialect.doublequote = True
    return dialect, has_header
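# A hedged, self-contained sketch (not from the sources above) of the
# sniff-then-fallback pattern that both guess_dialect variants implement;
# the sample string and delimiter candidates are illustrative assumptions.
import csv
from csv import Error, Sniffer

def sniff_or_fallback(sample):
    sniffer = Sniffer()
    try:
        return sniffer.sniff(sample), sniffer.has_header(sample)
    except Error:
        # Mirror the fallback above: unix dialect plus the first plausible delimiter.
        dialect = csv.unix_dialect
        dialect.delimiter = next((d for d in (",", ";", "|") if d in sample), ",")
        return dialect, False

dialect, has_header = sniff_or_fallback("ip,date,,country\n1.2.3.4,06-05-2016,,CZ\n")
print(dialect.delimiter, has_header)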
def __init__(self, inFile):
    from csv import Sniffer, reader
    csvFile = open(inFile, 'r')
    sample = csvFile.read(1024)
    csvFile.seek(0)
    # Sniffer must be instantiated; calling sniff/has_header on the class fails.
    sniffer = Sniffer()
    self.reader = reader(csvFile, sniffer.sniff(sample))
    if sniffer.has_header(sample):
        self.varNames = next(self.reader)
    else:
        self.varNames = None
def guess_dialect(f):
    # Taken from the Python standard docs, with thanks to Piers Goodhew <*****@*****.**>
    # <https://docs.python.org/2/library/csv.html#csv.Sniffer>
    s = Sniffer()
    try:
        retval = s.sniff(f.read(1024), [',', '\t'])  # 1024 taken from the Python docs
    except CSVError:
        retval = 'excel'
    finally:
        f.seek(0)  # The f.read above moves the file cursor in the CSV file.
    return retval
def converter():
    if not session['extension'] or not session['content_type']:
        flash('Unsupported file type', 'info')
        return redirect(url_for('.dropzone'))
    session['processed'] = False
    session['outputs'] = mkdtemp()
    is_csv = (session['extension'] == '.csv'
              or session['content_type'].startswith('text/csv'))
    is_excel = session['extension'] in ['.xls', '.xlsx'] or any(
        s in session['content_type'] for s in ['spreadsheet', 'xls', 'xlsx', 'excel'])
    is_text = (session['extension'] == '.txt'
               or session['content_type'].startswith('text/'))
    if is_csv:
        file_name = listdir(session['tmp_dir'])[0]
        file_path = join(session['tmp_dir'], file_name)
        # guess file encoding
        encoding = get_encoding(file_path)
        # guess separator
        with open(file_path, encoding=encoding) as f:
            sniffer = Sniffer()
            line = f.readline().encode(encoding).decode('utf-8')
            dialect = sniffer.sniff(line)
        df = pd.read_csv(file_path, encoding=encoding, dialect=dialect)
        session['fields'] = df.columns.tolist()
    elif is_excel:
        file_name = listdir(session['tmp_dir'])[0]
        file_path = join(session['tmp_dir'], file_name)
        df = pd.read_excel(file_path, encoding='utf-8')
        session['fields'] = df.columns.tolist()
    elif is_text:
        session['fields'] = ['id', 'text']
        dataset_json = texts_to_json(session['tmp_dir'])
        df = DataFrame(dataset_json)
    else:
        flash('Uploaded file types have not been recognized')
        return redirect(url_for('.dropzone'))
    df.to_csv(join(session['outputs'], 'original.csv'), index=False, encoding='utf-8')
    return redirect(url_for('.field_selection_get'))
def analyze_csv(file):
    try:
        dialect = Sniffer().sniff(''.join(read_lines_file(file, config.num_lines_to_check_csv_dialect)))
        log.info(f"delimiter: ({dialect.delimiter}) doublequote: ({dialect.doublequote}) "
                 f"escapechar: ({dialect.escapechar}) "
                 f"lineterminator: ({dialect.lineterminator}) quotechar: ({dialect.quotechar}) "
                 f"quoting: ({dialect.quoting}) "
                 f"skipinitialspace: ({dialect.skipinitialspace})")
    except _csv.Error:
        log.info("Could not determine the delimiter.")
        return ()
    try:
        file.seek(0)
        csv_file = reader(read_lines_file(file, config.num_lines_to_check_type_of_place),
                          dialect, quoting=QUOTE_ALL)
        csv_file = list(csv_file)
    except _csv.Error as err:
        log.info("<><><><><><><><><><><><><><><>")
        log.info(err)
        return ()
    # Determine the likely length of each csv row
    try:
        len_row = mode([len(x) for x in csv_file[0:]])
        log.info(f"likely row length: {len_row}")
    except StatisticsError:
        log.info('Problem determining the likely length of each csv row')
        return ()
    time_i = time()
    types_and_indexes_ = types_and_indexes(csv_file, driver)
    time_f = time()
    log.info(f"Time for type checking: {time_f - time_i}")
    if not types_and_indexes_[0]:
        return ()
    file.seek(0)
    return dialect, len_row, types_and_indexes_
def read_certified_applications(csv_file_path):
    """
    Generator function for reading the input CSV files.

    This function does the following:
    1. We sniff the file format so we can support both default excel formatted
       csv and the semi-colon separated files (see Test 1 and Test 2).
    2. We check if the file uses the previous LCA record layout. If so, we
       migrate it to the current H1B Record Layout.
    3. Filter out all applications that are not 'CERTIFIED'.

    The goal of this function is to separate code for reading the input file
    format from our analytical code.

    :param csv_file_path:
    :return:
    """
    print('Processing input file: {0}'.format(csv_file_path))
    with open(csv_file_path) as csv_file:
        # Sniff the file format
        dialect = Sniffer().sniff(csv_file.read(16384))
        csv_file.seek(0)
        reader = DictReader(csv_file, dialect=dialect)
        # Dirty migration of the previous LCA Record Layout to the current H1B Record Layout
        if 'LCA_CASE_NUMBER' in reader.fieldnames:
            reader.fieldnames = get_migrated_fieldnames()
        # Filter all applications that are not certified
        yield from filterfalse(lambda row: row['CASE_STATUS'] != 'CERTIFIED', reader)
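# Hedged usage sketch for read_certified_applications; the file name and the
# 'SOC_NAME' field are assumptions based on typical H-1B disclosure data,
# not taken from the source above.
from collections import Counter

top_occupations = Counter(
    row['SOC_NAME'] for row in read_certified_applications('h1b_input.csv')
)
for occupation, count in top_occupations.most_common(10):
    print(occupation, count)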
def sniff(file):
    # The pandas default behavior is to look at filename extensions,
    # but we decided we can't rely on those to be accurate.
    compression = {b'\x1f\x8b': 'gzip'}.get(file.read(2))
    file.seek(0)
    encoding = 'latin-1'
    if compression:
        peek_window = 1024  # arbitrary
        if compression == 'gzip':
            first_bytes = gzip.open(file).peek(peek_window)
        else:
            raise Exception('Unsupported compression type: {}'.format(compression))
        first_characters = first_bytes.decode(encoding)
    else:
        first_characters = file.readline().decode(encoding)
    is_gct = first_characters.startswith('#1.2')
    dialect = excel_tab if is_gct else Sniffer().sniff(first_characters)
    if search(r'\W', first_characters.split('\n')[0]):
        is_list = False
        as_list = None
    else:
        # No non-word characters in first line
        file.seek(0)
        is_list = True
        as_list = DataFrame({'item': [bytes.decode(encoding)
                                      for bytes in file.read().splitlines()]})
    file.seek(0)
    return SniffResult(compression=compression, is_gct=is_gct, dialect=dialect,
                       is_list=is_list, as_list=as_list)
def load_languages(
    path: FileSystemPath,
    dialect: Optional[DialectLike] = None,
    prefix: str = "",
) -> Dict[str, Language]:
    """Return a dictionary mapping each column to a language file."""
    with open(path, newline="") as csv_file:
        if not dialect:
            dialect = Sniffer().sniff(csv_file.read(1024))
            csv_file.seek(0)
        reader = DictReader(csv_file, dialect=dialect)
        key, *language_codes = reader.fieldnames or [""]
        languages = {code: Language({}) for code in language_codes}
        for row in reader:
            if not (identifier := row[key]):
                continue
            identifier = prefix + identifier
            for code in language_codes:
                if value := row[code]:
                    languages[code].data[identifier] = value
                else:
                    msg = f"Locale {code!r} has no translation for {identifier!r}"
                    logger.warning(msg)
    # The original snippet ended without returning; the annotated return type
    # implies the mapping should be returned.
    return languages
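# Hedged sketch of the CSV layout load_languages expects: one key column
# followed by one column per language code. The file name, codes, and
# identifiers below are invented for illustration.
#
#     key,en_us,fr_fr
#     menu.title,Title,Titre
#     menu.quit,Quit,Quitter
#
languages = load_languages("translations.csv", prefix="mypack.")
print(languages["en_us"].data.get("mypack.menu.title"))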
def processCSVMatrix(file):
    with open(file, 'r') as csvfile:
        dialect = Sniffer().sniff(csvfile.readline())
    df = DataFrame()
    for chunk in read_csv(file, sep=dialect.delimiter, mangle_dupe_cols=True,
                          index_col=False, chunksize=1000):
        df = concat([df, chunk], ignore_index=True)
    nodes = df.columns.values.tolist()
    nodes.pop(0)
    df["Unnamed: 0"] = nodes
    df = df.rename(columns={'Unnamed: 0': 'name'})
    df = df.set_index(keys='name')
    # Remove underscores in names
    names = df.columns.tolist()
    names = [name.replace('_', ' ') for name in names]
    df.columns = names
    df.set_index([df.columns], inplace=True)
    return df
def __init__(self, filepath: str) -> None:
    '''
    Opens the data file and loads its content.

    Args:
        filepath: str
            The path to the data file that contains keywords associated with
            the collected jobs descriptions.

    Raises:
        FileNotFoundError: filepath is not correct or related file cannot be found.
        PermissionError: read access to the specified file cannot be granted
    '''
    with open(filepath, 'r') as csv_f:
        # automatically detect separators and the internal format of the CSV file
        csv_dialect = Sniffer().sniff(csv_f.read(1024))
        # rewind it
        csv_f.seek(0)
        # and read its content
        self.entries = [row for row in reader(csv_f, csv_dialect)]
def _detect_dialect(cls, csv_file: TextIO, dialect: Dialect) -> Dialect:
    # Note: read() takes a size in characters, not rows, so name it accordingly.
    sample_size = 1024
    start_of_file = 0
    if not dialect:
        dialect = Sniffer().sniff(csv_file.read(sample_size))
        csv_file.seek(start_of_file)
    return dialect
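# Hedged, self-contained illustration of the sniff-once-then-rewind pattern
# that _detect_dialect implements; StringIO stands in for a real file handle.
from csv import Sniffer
from io import StringIO

handle = StringIO("x;y\n1;2\n")
dialect = Sniffer().sniff(handle.read(1024))
handle.seek(0)  # rewind so subsequent reads start from the first row
print(dialect.delimiter)  # ';'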
def __init__(self, source, schema, **kargs):
    super().__init__(source, **kargs)
    self.__source = source
    self.__filename = os.path.basename(source)
    self.__headers_4_mipmap = OrderedDict()
    # if csv file, get headers (tabulator aka csv file)
    if not self._Table__storage:
        # encoding utf-8-sig removes the byte order mark (BOM)
        with open(source, 'r', encoding='utf-8-sig') as csv_file:
            # find the dialect of the csv file
            try:
                dialect = Sniffer().sniff(csv_file.read(1024))
            except Exception:  # narrowed from a bare except
                dialect = 'excel'
            # reset the seeker to the start of the file
            csv_file.seek(0)
            reader = DictReader(csv_file, dialect=dialect)
            self.__actual_headers = reader.fieldnames
    else:
        self.__actual_headers = None
    if self.__actual_headers:
        self.__create_headers_4_mipmap()
    # QcSchema
    if isinstance(schema, QcSchema):
        self._Table__schema = schema
        self.__metadata = True
    elif isinstance(schema, dict):
        self._Table__schema = QcSchema(schema)
        self.__metadata = True
    else:
        self.__metadata = False
def trainiere():
    # Collect data from the POST request
    maxBlattPunkte = request.form["maxBlattPunkte"]
    blattNamen = request.form["blattNamen"]
    maxKlausurPunkte = request.form.get("maxKlausurPunkte")
    klausurPunkteName = request.form.get("klausurPunkteName")
    note = request.form.get("note")
    datei = request.files["daten"]
    inhalt = datei.read()
    # Decode the upload before sniffing; sniffing str(bytes) would include the
    # b'...' repr and escape sequences.
    sep = Sniffer().sniff(inhalt.decode("utf-8")[:100]).delimiter
    df = pd.read_csv(StringIO(str(inhalt, "utf-8")), sep=sep)
    df.replace(",", ".", regex=True, inplace=True)
    try:
        # Align the data with the original data and store the important features
        df = bereiteTraining(df, maxBlattPunkte.split(","), blattNamen,
                             maxKlausurPunkte, klausurPunkteName, note)
        df.to_csv("Server/data/Training/" + datei.filename, index=False)
        # Retrain all estimators with the updated data
        schaetzer.trainiere()
        flash("Upload successful!", "erfolg")
    except Exception as e:
        flash("Upload unfortunately not successful: " + str(e), "error")
        return redirect(url_for("startseite",
                                maxBlattPunkte=maxBlattPunkte,
                                blattNamen=blattNamen,
                                maxKlausurPunkte=maxKlausurPunkte,
                                klausurPunkteName=klausurPunkteName,
                                note=note))
    return redirect(url_for("startseite"))
def __init__(self, fname, encoding, verbose=False):
    self._verbose = verbose
    # List of dicts
    self.data = []
    # Set up CSV reader
    fh = open(fname, 'rb')
    try:
        dialect = Sniffer().sniff(fh.read(1024))
        fh.seek(0)
        csv = reader(fh, dialect=dialect)
    except Error:
        fh.seek(0)
        csv = reader(fh, delimiter="\t")
    # Read labels
    labels = [v.strip() for v in csv.next()]
    self._fieldcount = len(labels)
    # Read values
    for line in csv:
        try:
            if line[0].strip()[0] == '#':
                # skip comment lines
                continue
        except IndexError:
            pass
        d = {}
        for i, v in enumerate(line):
            d[labels[i]] = v.decode(encoding).strip()
        self.data.append(d)
def iter_dataset(self, options):
    fp = options.get("dataset_file")
    filetype = options["filetype"]
    if filetype == "mongo":
        assert options["mongo_collection"]
        db = self.get_mongo_db()
        coll = db[options["mongo_collection"]]
        for l in coll.find():
            del l["_id"]
            yield l
    elif filetype == "json":
        for l in json.load(fp):
            yield l
    elif filetype == "jsonlines":
        for l in fp:
            if l:
                yield json.loads(l)
    elif filetype == "csv":
        if self.csv_dialect is None:
            dialect = Sniffer().sniff(fp.read(1024 * 16))
            fp.seek(0)
        else:
            dialect = self.csv_dialect
        r = DictReader(fp, dialect=dialect)
        for l in r:
            yield l
    else:
        raise NotImplementedError()
def get_projects(brigade, projects_list_url):
    '''
        Get a list of projects from CSV, TSV, or JSON. Convert to a dict.

        TODO: Have this work for GDocs.
    '''
    print 'Asking for', projects_list_url
    got = get(projects_list_url)
    try:
        # If projects_list_url is a json file
        projects = [dict(brigade=brigade, code_url=item) for item in got.json()]
    except ValueError:
        # If projects_list_url is a type of csv
        data = got.text.splitlines()
        dialect = Sniffer().sniff(data[0])
        projects = list(DictReader(data, dialect=dialect))
        for project in projects:
            project['brigade'] = brigade
    map(update_project_info, projects)
    return projects
def getDelimiter(path):
    sniffer = Sniffer()
    with open(path, 'r') as rfile:
        header = rfile.readline()
        sample = header + rfile.readline() + rfile.readline()
    try:
        asniff = sniffer.sniff(sample, delimiters=";, ")
    except Exception:
        # Fall back to a comma-separated stub dialect when sniffing fails
        class tsniff(object):
            lineterminator = "\n"
            delimiter = ","
        asniff = tsniff()
    asniff.lineterminator = "\n"
    return asniff.delimiter, sniffer.has_header(sample)
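# Hedged usage sketch for getDelimiter above; "measurements.csv" is a
# placeholder path, not taken from the source.
delimiter, has_header = getDelimiter("measurements.csv")
with open("measurements.csv") as rfile:
    if has_header:
        next(rfile)  # skip the header line
    rows = [line.rstrip("\n").split(delimiter) for line in rfile]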
def __download_as_pandas(self, chunksize, sniff_ahead=2**20):
    """Download and parse data from URL as a table"""
    with self.__tempfile() as tempfile:
        self.url = self.__copyfileobj(tempfile)
        with open(tempfile, mode="rb") as handle:
            magic = handle.read(3)
        if magic == b"\x1f\x8b\x08":
            compression = "gzip"
            from gzip import open as _open
        elif magic == b"\x42\x5a\x68":
            compression = "bz2"
            from bz2 import open as _open
        else:
            compression, _open = "infer", open
        try:
            with _open(tempfile, mode="rt", newline="") as handle:
                sep = Sniffer().sniff(handle.read(sniff_ahead)).delimiter
            _reader_kw = dict(
                sep=sep, compression=compression, chunksize=chunksize,
                **self.pandas_kws,
            )
            for i, csv_chunk in enumerate(read_csv(tempfile, **_reader_kw)):
                self.INPLACE_process(csv_chunk)
                msg = f"interpreted table chunk {i}:\n  {tempfile}"
                GeneFabLogger.info(f"{self.name}; {msg}")
                yield csv_chunk
        except (IOError, UnicodeDecodeError, CSVError, PandasParserError):
            msg = "Not recognized as a table file"
            raise GeneFabFileException(msg, name=self.name, url=self.url)
def read(file):
    with open(file, "r") as csv:
        sniffer = Sniffer()
        sample = csv.read(4096)
        dialect = sniffer.sniff(sample, delimiters=[';', ','])
        csv.seek(0)
        lines_reader = DictReader(csv, dialect=dialect)
        lines = []
        for line in lines_reader:
            lines.append(line)
        return lines, lines_reader.fieldnames
def has_csv_header(self):
    """ Checks if csv file has header """
    self.file.seek(0)
    has_header = Sniffer().has_header(next(iter(self)))
    self.file.seek(0)
    return has_header
def get_dialect(self, *args, lines=5, **kwargs):
    """ Wrap csv.Sniffer.sniff to handle comments """
    self.file.seek(0)
    # Sniffer.sniff expects a sample string, not a list of lines
    sample = "".join(line for line, _ in zip(self, range(lines)))
    dialect = Sniffer().sniff(sample, *args, **kwargs)
    self.file.seek(0)
    return dialect
def test_CSVLogger(tmpdir):
    np.random.seed(1337)
    filepath = str(tmpdir / 'log.tsv')
    sep = '\t'
    (X_train, y_train), (X_test, y_test) = get_test_data(num_train=train_samples,
                                                         num_test=test_samples,
                                                         input_shape=(input_dim,),
                                                         classification=True,
                                                         num_classes=num_classes)
    y_test = np_utils.to_categorical(y_test)
    y_train = np_utils.to_categorical(y_train)

    def make_model():
        np.random.seed(1337)
        model = Sequential()
        model.add(Dense(num_hidden, input_dim=input_dim, activation='relu'))
        model.add(Dense(num_classes, activation='softmax'))
        model.compile(loss='categorical_crossentropy',
                      optimizer=optimizers.SGD(lr=0.1),
                      metrics=['accuracy'])
        return model

    # case 1, create new file with defined separator
    model = make_model()
    cbks = [callbacks.CSVLogger(filepath, separator=sep)]
    model.fit(X_train, y_train, batch_size=batch_size,
              validation_data=(X_test, y_test), callbacks=cbks, epochs=1)

    assert os.path.isfile(filepath)
    with open(filepath) as csvfile:
        dialect = Sniffer().sniff(csvfile.read())
    assert dialect.delimiter == sep
    del model
    del cbks

    # case 2, append data to existing file, skip header
    model = make_model()
    cbks = [callbacks.CSVLogger(filepath, separator=sep, append=True)]
    model.fit(X_train, y_train, batch_size=batch_size,
              validation_data=(X_test, y_test), callbacks=cbks, epochs=1)

    # case 3, reuse of CSVLogger object
    model.fit(X_train, y_train, batch_size=batch_size,
              validation_data=(X_test, y_test), callbacks=cbks, epochs=2)

    import re
    with open(filepath) as csvfile:
        list_lines = csvfile.readlines()
        for line in list_lines:
            assert line.count(sep) == 4
        assert len(list_lines) == 5
        output = " ".join(list_lines)
        assert len(re.findall('epoch', output)) == 1

    os.remove(filepath)
    assert not tmpdir.listdir()
def load(self, filename):
    assert filename is not None, "No datafile for %s!" % self.label
    with filename.open(encoding="utf-8-sig") as handle:
        dialect = Sniffer().sniff(handle.readline(), [',', ';'])
        dialect.doublequote = True
        handle.seek(0)
        with csvw.UnicodeDictReader(handle, dialect=dialect) as reader:
            for i, row in enumerate(reader, 2):  # 2 as row 1 is consumed for header
                # standardise NA values.
                try:
                    row = {k: self.fix_value(v) for (k, v) in row.items()}
                except:
                    print("ERROR PARSING %s" % filename)
                    print("ROW %d: %r" % (i, row))
                    raise
                if row['word'] == '':
                    # skip empty rows.
                    continue
                # merge notes column into comment
                try:
                    note = row.pop('notes')
                except KeyError:
                    note = ""
                if note:
                    if row.get("comment", "") == "":
                        row['comment'] = note
                    else:
                        print('NOTE', filename, row['comment'], note)
                        raise ValueError("Fix %s:%d manually" % (self.label, i))
                # lose translation
                try:
                    trans = row.pop('translation')
                except KeyError:
                    trans = ""
                if trans:
                    print("Value %s in `translation` will be ignored." % trans)
                    raise ValueError("Fix %s:%d manually" % (self.label, i))
                yield row
def load_csv(path: str = 'employees.csv') -> List[List]:
    try:
        with open(path) as f:
            dialect = Sniffer().sniff(f.read(1024))
            f.seek(0)
            rdr = reader(f, dialect)
            return list(rdr)
    except FileNotFoundError:
        warn(f'File not found. Please, make sure to place {path} file in the directory!')
        exit(1)
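# Hedged usage sketch for load_csv; assumes employees.csv starts with a header
# row, since load_csv itself returns header and data rows together.
rows = load_csv('employees.csv')
header, records = rows[0], rows[1:]
print(header, len(records))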
def p_csv(dialect: Optional[str], padding: bool) -> int:
    data = stdin.read()
    sniffer = Sniffer()
    has_header = sniffer.has_header(data)
    try:
        if not has_header:
            print(data, end="")
            return 0
        else:
            d = dialect or sniffer.sniff(data)
            r = _read(data, dialect=d, padding=padding)
            w = writer(stdout, dialect=d)
            w.writerows(r)
    except CSVErr as e:
        log.critical("%s", f"{ERROR}{linesep}{e}")
        return 1
    else:
        return 0
def simulation_matrix(sim_file):
    if not Path(sim_file).exists():
        raise ValueError("Invalid simulation-matrix")
    with open(sim_file) as csvfile:
        dialect = Sniffer().sniff(csvfile.read(4096))
        csvfile.seek(0)
        reader = DictReader(csvfile, dialect=dialect)
        for row in reader:
            yield row
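# Hedged usage sketch for simulation_matrix; 'matrix.csv' is a placeholder path.
for row in simulation_matrix('matrix.csv'):
    print(row)  # each row is a dict keyed by the CSV header fields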
def add(self, pattern):  # pylint: disable=missing-docstring
    for csv_file in [Path(p).resolve() for p in glob(pattern)]:
        with csv_file.open("r", encoding=self._encoding) as fread:
            sample = fread.readline()
            fread.seek(0)
            if sample:
                dialect = Sniffer().sniff(sample)
                self._entries += DictReader(fread, fieldnames=self._field_names,
                                            dialect=dialect)
    self._entries.sort(key=lambda dictionary: int(dictionary["#"]))
def _read_file_safe(
    self, some_file: File, ignore_headers: bool
) -> Generator[Either[Message, SomeModel], None, None]:
    sniffer = Sniffer()
    try:
        with some_file as csv:
            dialect = sniffer.sniff(csv.read(1024))
            csv.seek(0)
            reader = DictReader(f=csv, fieldnames=self._fields, dialect=dialect)
            yield from self._read(reader, ignore_headers)
    except Exception as e:
        message = Message(category=MessageCategory.ERROR,
                          key='import_csv_generic_error', args=[e])
        yield Left([message])
def read(filename):
    with open(filename, "r") as csvfile:
        sniffer = Sniffer()
        sample = csvfile.read(4096)
        dialect = sniffer.sniff(sample, delimiters=[';', ','])
        if sniffer.has_header(sample):
            # file has header
            pass
        csvfile.seek(0)
        lines_reader = DictReader(csvfile, dialect=dialect)
        lines = []
        for line in lines_reader:
            lines.append(line)
        return lines, lines_reader.fieldnames
def add(self, pattern):
    for csv_file in [abspath(p) for p in glob(pattern)]:
        with open(csv_file, "r") as f:
            sample = f.readline()
            f.seek(0)
            if len(sample) > 0:
                dialect = Sniffer().sniff(sample)
                self._entries += DictReader(f, fieldnames=self._field_names,
                                            dialect=dialect)
    self._entries.sort(key=lambda dictionary: int(dictionary['#']))
def read(filename):
    with open(filename, "r") as csvfile:
        sniffer = Sniffer()
        sample = csvfile.read(1024)
        dialect = sniffer.sniff(sample, delimiters=[';', ','])
        if sniffer.has_header(sample):
            # file has header
            pass
        csvfile.seek(0)
        lines_reader = DictReader(csvfile, dialect=dialect)
        lines = []
        for line in lines_reader:
            lines.append(line)
        return lines
def get_projects(organization):
    '''
        Get a list of projects from CSV, TSV, JSON, or Github URL.
        Convert to a dict.

        TODO: Have this work for GDocs.
    '''
    _, host, path, _, _, _ = urlparse(organization.projects_list_url)
    matched = match(r'(/orgs)?/(?P<name>[^/]+)/?$', path)
    if host in ('www.github.com', 'github.com') and matched:
        projects_url = 'https://api.github.com/users/%s/repos' % matched.group('name')
    else:
        projects_url = organization.projects_list_url
    logging.info('Asking for ' + projects_url)
    response = get(projects_url)
    try:
        data = get_adjoined_json_lists(response)
    except ValueError:
        # If projects_list_url is a type of csv
        data = response.text.splitlines()
        dialect = Sniffer().sniff(data[0])
        #
        # Google Docs CSV output uses double quotes instead of an escape char,
        # but there's not typically a way to know that just from the dialect
        # sniffer. If we see a comma delimiter and no escapechar, then set
        # doublequote to True so that GDocs output doesn't barf.
        #
        if dialect.delimiter == ',' and dialect.doublequote is False and dialect.escapechar is None:
            dialect.doublequote = True
        projects = list(DictReader(data, dialect=dialect))
        for project in projects:
            project['organization_name'] = organization.name
    else:
        # If projects_list_url is a json file
        if len(data) and type(data[0]) in (str, unicode):
            # Likely that the JSON data is a simple list of strings
            projects = [dict(organization_name=organization.name, code_url=item)
                        for item in data]
        elif len(data) and type(data[0]) is dict:
            # Map data to name, description, link_url, code_url (skip type, categories)
            projects = [dict(name=p['name'], description=p['description'],
                             link_url=p['homepage'], code_url=p['html_url'],
                             organization_name=organization.name) for p in data]
        elif len(data):
            raise Exception('Unknown type for first project: "%s"' % repr(type(data[0])))
        else:
            projects = []
    map(update_project_info, projects)
    return projects
def get_projects(organization):
    '''
        Get a list of projects from CSV, TSV, JSON, or Github URL.
        Convert to a dict.

        TODO: Have this work for GDocs.
    '''
    _, host, path, _, _, _ = urlparse(organization.projects_list_url)
    matched = match(r'(/orgs)?/(?P<name>[^/]+)/?$', path)
    if host in ('www.github.com', 'github.com') and matched:
        projects_url = 'https://api.github.com/users/%s/repos' % matched.group('name')
        response = get_github_api(projects_url)
    else:
        projects_url = organization.projects_list_url
        logging.info('Asking for ' + projects_url)
        response = get(projects_url)
    try:
        data = get_adjoined_json_lists(response)
    except ValueError:
        # If projects_list_url is a type of csv
        data = response.content.splitlines()
        try:
            dialect = Sniffer().sniff(response.content)
            #
            # Google Docs CSV output uses double quotes instead of an escape char,
            # but there's not typically a way to know that just from the dialect
            # sniffer. If we see a comma delimiter and no escapechar, then set
            # doublequote to True so that GDocs output doesn't barf.
            #
            # Code for Philly's CSV is confusing the sniffer. I suspect it's the
            # fields with quoted empty strings.
            # "OpenPhillyGlobe","\"Google Earth for Philadelphia\" with open source
            # and open transit data.","http://cesium.agi.com/OpenPhillyGlobe/",
            # "https://github.com/AnalyticalGraphicsInc/OpenPhillyGlobe","",""
            #
            if '\\' in response.content:
                dialect.escapechar = '\\'
            # Check for quoted empty strings vs doublequotes
            if ',""' not in response.content and '""' in response.content:
                dialect.doublequote = True
            projects = list(DictReader(data, dialect=dialect))
        except csv.Error:
            projects = list(DictReader(data))
        # Decode everything to unicode objects.
        for (index, proj) in enumerate(projects):
            projects[index] = dict([(k.decode('utf8'), v.decode('utf8'))
                                    for (k, v) in proj.items()])
        # Add organization names along the way.
        for project in projects:
            project['organization_name'] = organization.name
    else:
        # Fail silently when the github url is not valid
        if type(data) != list and data['message'] == u'Not Found':
            return []
        # If projects_list_url is a json file
        if len(data) and type(data[0]) in (str, unicode):
            # Likely that the JSON data is a simple list of strings
            projects = [dict(organization_name=organization.name, code_url=item)
                        for item in data]
        elif len(data) and type(data[0]) is dict:
            # Map data to name, description, link_url, code_url (skip type, categories)
            # all keys don't always exist
            projects = []
            for project in data:
                new_project = {}
                new_project['organization_name'] = organization.name
                if "name" in project:
                    new_project["name"] = project["name"]
                if "description" in project:
                    new_project["description"] = project["description"]
                if "homepage" in project:
                    new_project["link_url"] = project["homepage"]
                if "html_url" in project:
                    new_project["code_url"] = project["html_url"]
                projects.append(new_project)
        elif len(data):
            raise Exception('Unknown type for first project: "%s"' % repr(type(data[0])))
        else:
            projects = []
    projects = [update_project_info(proj) for proj in projects]
    # Filter out projects that have not been updated
    new_projects = []
    for proj in projects:
        if proj is not None:
            new_projects.append(proj)
    return new_projects