def scan_sampling_iters(fd: TextIO, config_dict: Dict[str, Any], lineno: int) -> int:
    """
    Parse sampling iterations, save number of iterations to config_dict.
    """
    draws_found = 0
    num_cols = len(config_dict['column_names'])
    cur_pos = fd.tell()
    line = fd.readline().strip()
    while len(line) > 0 and not line.startswith('#'):
        lineno += 1
        draws_found += 1
        data = line.split(',')
        if len(data) != num_cols:
            raise ValueError(
                'line {}: bad draw, expecting {} items, found {}\n'.format(
                    lineno, num_cols, len(data))
                + 'This error could be caused by running out of disk space.\n'
                'Try clearing up TEMP or setting output_dir to a path'
                ' on another drive.',
            )
        cur_pos = fd.tell()
        line = fd.readline().strip()
    config_dict['draws_sampling'] = draws_found
    fd.seek(cur_pos)
    return lineno

def get_file_size(f: typing.TextIO) -> int:
    """
    Gets file size. This function restores the file position.
    """
    restore = f.tell()
    f.seek(0, 2)
    ret = f.tell()
    f.seek(restore)
    return ret

def get_file_size(file: TextIO) -> int:
    """Get size of file in bytes without altering the file position."""
    starting_pos = file.tell()
    file.seek(0, os.SEEK_END)
    file_size = file.tell()
    file.seek(starting_pos)
    return file_size

def scan_config(fd: TextIO, config_dict: Dict[str, Any], lineno: int) -> int:
    """
    Scan initial stan_csv file comment lines and save non-default
    configuration information to config_dict.
    """
    cur_pos = fd.tell()
    line = fd.readline().strip()
    while len(line) > 0 and line.startswith('#'):
        lineno += 1
        if line.endswith('(Default)'):
            line = line.replace('(Default)', '')
        line = line.lstrip(' #\t')
        key_val = line.split('=')
        if len(key_val) == 2:
            if key_val[0].strip() == 'file' and not key_val[1].endswith('csv'):
                config_dict['data_file'] = key_val[1].strip()
            elif key_val[0].strip() != 'file':
                raw_val = key_val[1].strip()
                val: Union[int, float, str]
                try:
                    val = int(raw_val)
                except ValueError:
                    try:
                        val = float(raw_val)
                    except ValueError:
                        val = raw_val
                config_dict[key_val[0].strip()] = val
        cur_pos = fd.tell()
        line = fd.readline().strip()
    fd.seek(cur_pos)
    return lineno

def scan_draws(fp: TextIO, config_dict: Dict, lineno: int) -> int:
    """
    Parse draws, check elements per draw, save num draws to config_dict.
    """
    draws_found = 0
    num_cols = len(config_dict['column_names'])
    cur_pos = fp.tell()
    line = fp.readline().strip()
    first_draw = None
    while len(line) > 0 and not line.startswith('#'):
        lineno += 1
        draws_found += 1
        data = line.split(',')
        if len(data) != num_cols:
            raise ValueError(
                'line {}: bad draw, expecting {} items, found {}'.format(
                    lineno, num_cols, len(data)))
        if first_draw is None:
            first_draw = np.array(data, dtype=np.float64)
        cur_pos = fp.tell()
        line = fp.readline().strip()
    config_dict['draws'] = draws_found
    config_dict['first_draw'] = first_draw
    fp.seek(cur_pos)
    return lineno

def _last(_f: TextIO, _l: int, _buffer: int = 4096):
    # Return the last ``_l`` lines and the resulting file position.
    # NOTE: ``_buffer`` (bytes read from the end per attempt) was undefined in
    # the original snippet; it is assumed here as a parameter defaulting to 4096.
    while True:
        try:
            _f.seek(-1 * _buffer, os.SEEK_END)
            at_start = False
        except IOError:
            _f.seek(0)  # chunk exceeds file size: read from the beginning
            at_start = True
        found = _f.readlines()
        if len(found) >= _l or at_start:
            return found[-_l:], _f.tell()
        _buffer *= 2  # not enough lines yet: widen the chunk

def _read_file_chunk(file: TextIO, chunksize: int) -> str:
    """
    Read a chunk starting `chunksize` characters before the file pointer and
    ending at the current file pointer. If `chunksize` is larger than the
    current file pointer, the file is read from the beginning.

    Returns the read content in reverse order and moves the file pointer to
    where the content starts. Reverse order is used because it is usually
    faster to search for newlines that way, especially if there are many
    lines in a given chunk.
    """
    mov = file.tell() - max(file.tell() - chunksize, 0)
    file.seek(file.tell() - mov)
    reversed_content = file.read(mov)[::-1]
    file.seek(file.tell() - mov)
    return reversed_content

def scan_warmup(fp: TextIO, config_dict: Dict, lineno: int) -> int:
    """
    Check warmup iterations, if any.
    """
    if 'save_warmup' not in config_dict:
        return lineno
    cur_pos = fp.tell()
    line = fp.readline().strip()
    while len(line) > 0 and not line.startswith('#'):
        lineno += 1
        cur_pos = fp.tell()
        line = fp.readline().strip()
    fp.seek(cur_pos)
    return lineno

def _read_game(self, infile: typing.TextIO):
    """Read events and generate data until a new game is declared in the file.

    Args:
        infile: An open buffer reading the event file.

    Yields:
        A row of tabular data for every play in the game.
    """
    while True:
        prev_loc = infile.tell()
        line = infile.readline()
        if not line:
            return
        fields = line.strip().split(",")
        if fields[0] == "id":
            infile.seek(prev_loc)
            return
        if fields[0] in ["start", "sub"]:
            pid: str = fields[1]
            pos: int = int(fields[5])
            lineup = self.h_lineup if int(fields[3]) else self.v_lineup
            lineup[pos - 1] = pid
            continue
        if fields[0] == "play":
            self._process_play(*fields[1:])
            yield self._current_event
            self._current_event = None

def parse_conllu_plus_fields(
    in_file: T.TextIO,
    metadata_parsers: T.Optional[T.Dict[str, _MetadataParserType]] = None
) -> T.Optional[T.Sequence[str]]:
    pos = in_file.tell()

    # Get the first line
    try:
        first_sentence = next(parse_sentences(in_file))
        first_line = first_sentence.split("\n")[0]
    except StopIteration:
        first_line = ""

    # parse_sentences moves the file cursor, so reset it here
    in_file.seek(pos)

    if not first_line.startswith("#"):
        return None

    tokenlist = parse_token_and_metadata(first_line, metadata_parsers=metadata_parsers)
    metadata = tokenlist.metadata

    fields = None
    if "global.columns" in metadata and metadata["global.columns"]:
        fields = [
            value.lower()
            for value in metadata["global.columns"].split(" ")
        ]

    return fields

def get_mzXMLs_from_pep_xml(pepxml_file: typing.TextIO):
    REC = re.compile(' base_name="(.+?)"')
    currpos = pepxml_file.tell()
    t = pepxml_file.read()
    pepxml_file.seek(currpos)
    paths = map(pathlib.Path, REC.findall(t))
    return [path.with_suffix(".mzXML") for path in paths if path.is_absolute()]

def from_file(cls, fp: TextIO, skip_line=2):
    id = fp.readline()
    if not id:  # readline() returns '' at EOF, not None
        raise RuntimeError('No more lines exist in {}'.format(fp))
    for i in range(skip_line):
        fp.readline()
    size = eval(fp.readline())
    data = []
    for i in range(size):
        now_position = fp.tell()
        line = fp.readline()
        try:
            x, y = map(float, line.split())
            data.append([x, y])
        except Exception as e:
            # the number of points in the fitted data does not equal self._size
            logger.warning(
                f"Expected number of points in {fp} is {size}, but only got {i} points"
            )
            fp.seek(now_position)  # go back to the previous line
            size = i  # update the number of points
            break
    return SinglePoint(size, data)

def _new_game(self, game_id: str, infile: typing.TextIO):
    """Consumes lines describing game metadata from the event file.

    Args:
        game_id: Identifier for the game.
        infile: Open buffer reading the event file.

    Returns:
        None.
    """
    self.current_game = {"id": game_id}
    while True:
        prev_loc = infile.tell()
        line = infile.readline()
        if not line:
            raise Exception("Encountered EOF while parsing new game info")
        fields = line.strip().split(",")
        if fields[0] not in ("version", "info"):
            infile.seek(prev_loc)
            return
        if fields[0] == "info":
            field, value = fields[1:]
            self.current_game[field] = value
            if field == "visteam":
                self.v_roster = data.get_roster(self.year, value)
            elif field == "hometeam":
                self.h_roster = data.get_roster(self.year, value)

def scan_warmup_iters(fd: TextIO, config_dict: Dict, lineno: int) -> int:
    """
    Check warmup iterations, if any.
    """
    if 'save_warmup' not in config_dict:
        return lineno
    cur_pos = fd.tell()
    line = fd.readline().strip()
    draws_found = 0
    while len(line) > 0 and not line.startswith('#'):
        lineno += 1
        draws_found += 1
        cur_pos = fd.tell()
        line = fd.readline().strip()
    fd.seek(cur_pos)
    config_dict['draws_warmup'] = draws_found
    return lineno

def dump_mordict(self, buffer: typing.TextIO, with_comments: bool = True) -> None:
    if self.enabled:
        self.phon.dump_mordict(buffer, with_comments)
        buffer.write("\t")
        self.cat.dump_mordict(buffer, with_comments)

        if self.sem.value:
            buffer.write(" ")
        # === END IF ===
        self.sem.dump_mordict(buffer, with_comments)

        if self.gloss.value:
            buffer.write(" ")
        # === END IF ===
        self.gloss.dump_mordict(buffer, with_comments)
    else:
        # discard contents and just dump comments
        disabled_entry = io.StringIO()  # type: io.StringIO

        self.phon.dump_mordict(disabled_entry, with_comments)
        # NOTE: the separators below were written to ``buffer`` in the original
        # snippet; writing them to ``disabled_entry`` matches the apparent intent
        # of assembling the disabled entry in the scratch buffer.
        disabled_entry.write("\t")
        self.cat.dump_mordict(disabled_entry, with_comments)

        if self.sem.value:
            disabled_entry.write(" ")
        # === END IF ===
        self.sem.dump_mordict(disabled_entry, with_comments)

        if self.gloss.value:
            disabled_entry.write(" ")
        # === END IF ===
        self.gloss.dump_mordict(disabled_entry, with_comments)

        disabled_entry_res = disabled_entry.getvalue().replace(
            "\r\n", " ").replace("\n", " ").replace("\r", " ")  # type: str

        buffer.writelines((
            "% DISABLED: ",
            disabled_entry_res,
        ))
    # === END IF ===

    if with_comments:
        for c in self.comments:
            c.dump_mordict(buffer, with_comments=True)
        # === END FOR c ===
    # === END IF ===

    if buffer.seekable():
        buffer.seek(buffer.tell() - 1)
        last_char = buffer.read()  # type: str
        if last_char not in "\n\r":
            buffer.write("\n")
        # === END IF ===
    else:
        buffer.write("\n")

def run_csv2rdf(csv_filename: str, metadata_filename: str, csv_io: TextIO,
                metadata_io: TextIO, codelists_base: Optional[str] = None):
    client = docker.from_env()
    csv2rdf = client.containers.create(
        'gsscogs/csv2rdf',
        command=f'csv2rdf -m annotated -o /tmp/output.ttl -t /tmp/{csv_filename} -u /tmp/{metadata_filename}'
    )
    archive = BytesIO()
    metadata_io.seek(0, SEEK_END)
    metadata_size = metadata_io.tell()
    metadata_io.seek(0)
    csv_io.seek(0, SEEK_END)
    csv_size = csv_io.tell()
    csv_io.seek(0)
    with TarFile(fileobj=archive, mode='w') as t:
        tis = TarInfo(str(metadata_filename))
        tis.size = metadata_size
        tis.mtime = time.time()
        t.addfile(tis, BytesIO(metadata_io.read().encode('utf-8')))
        tic = TarInfo(str(csv_filename))
        tic.size = csv_size
        tic.mtime = time.time()
        t.addfile(tic, BytesIO(csv_io.read().encode('utf-8')))
        if codelists_base is not None:
            t.add(Path('features') / 'fixtures' / codelists_base,
                  arcname=codelists_base)
    archive.seek(0)
    csv2rdf.put_archive('/tmp/', archive)
    csv2rdf.start()
    response = csv2rdf.wait()
    sys.stdout.write(csv2rdf.logs().decode('utf-8'))
    assert_equal(response['StatusCode'], 0)
    output_stream, output_stat = csv2rdf.get_archive('/tmp/output.ttl')
    output_archive = BytesIO()
    for line in output_stream:
        output_archive.write(line)
    output_archive.seek(0)
    with TarFile(fileobj=output_archive, mode='r') as t:
        output_ttl = t.extractfile('output.ttl')
        return output_ttl.read()

def sniff_reader(file: typing.TextIO, num: int = 40) -> ReadFuncTy:
    pos = file.tell()
    header = file.readline()
    # read first N lines to sniff
    sample = header + "".join(itertools.islice(file, num - 1))
    ret = None
    try:
        b = sniff_json(sample)
    except ValueError:
        try:
            dialect = csv.Sniffer().sniff(sample, (";", ","))
        except csv.Error:
            try:
                linereader = sniff_xmage(sample.splitlines())
            except ValueError:
                linereader = sniff_plain(sample.splitlines())
                mylogger.MAINLOGGER.info("Plain text guessed")
                ret = lambda fp: read_txt(fp, line_reader=linereader)
            else:
                mylogger.MAINLOGGER.info("Xmage save file guessed")
                ret = lambda fp: read_xmage_deck(fp, line_reader=linereader)
        else:
            mylogger.MAINLOGGER.info("CSV input guessed")
            v = csv.reader([header], dialect=dialect)
            line = list(map(str.lower, next(v)))
            count_column = line.index("count")
            name_column = line.index("name")
            try:
                section_column = line.index("section")
            except ValueError:
                section_column = None
            try:
                version_column = line.index("edition")
            except ValueError:
                version_column = None
            try:
                collectors_num_column = line.index("card number")
            except ValueError:
                collectors_num_column = None
            try:
                language_column = line.index("language")
            except ValueError:
                language_column = None
            ret = lambda file: read_csv(
                file,
                name_column=name_column,
                count_column=count_column,
                version_column=version_column,
                section_column=section_column,
                collectors_num_column=collectors_num_column,
                language_column=language_column,
                dialect=dialect)
    else:
        mylogger.MAINLOGGER.info("JSON file guessed")
        ret = lambda fp: read_json(fp)
    file.seek(pos)
    return ret

def read_params_from_txt(cls, file: TextIO):
    start_pos = file.tell()
    header = file.readline()
    if not header.startswith(cls.header_n_params_key):
        warnings.warn('File has no header parameters')
        file.seek(start_pos)
        return
    json_length = int(header.split()[1]) + 1  # 1 for last '\n'
    params = json.loads(file.read(json_length), cls=ReprJSONDecoder)
    return params

def should_rotate(message: loguru.Message, file: typing.TextIO) -> bool:
    """Decide whether the bot should rotate the log: once a week, or once the file grows past 5 MB."""
    filepath = os.path.abspath(file.name)
    creation = os.path.getmtime(filepath)
    now = message.record["time"].timestamp()
    max_time = 7 * 24 * 60 * 60  # 1 week in seconds
    if file.tell() + len(message) > 5 * (2**20):  # would exceed 5 MB
        return True
    if now - creation > max_time:
        return True
    return False

def write_to_file(self, file_stream: TextIO, cards: Iterator[Card]):
    writer = csv.writer(file_stream, delimiter=",")
    if file_stream.tell() == 0:
        writer.writerow(
            ["uid", "level", "vip_type", "vip_status", "timestamp"])
    for card in cards:
        user = card.user
        writer.writerow([
            user.uid, user.level, user.vip_type, user.vip_status,
            card.timestamp
        ])

def detect_eval(fp: TextIO) -> Callable[[str], TParseRet]:
    pos = fp.tell()
    line = fp.readline()
    fp.seek(pos)
    if "rbp_eval" in line:
        parse_func = rbp_parse
    elif "runid" in line:
        parse_func = gdeval_parse
    else:
        parse_func = trec_parse
    return parse_func

def scan_sampling_iters(fd: TextIO, config_dict: Dict, lineno: int) -> int:
    """
    Parse sampling iterations, save number of iterations to config_dict.
    """
    draws_found = 0
    num_cols = len(config_dict['column_names'])
    cur_pos = fd.tell()
    line = fd.readline().strip()
    while len(line) > 0 and not line.startswith('#'):
        lineno += 1
        draws_found += 1
        data = line.split(',')
        if len(data) != num_cols:
            raise ValueError(
                'line {}: bad draw, expecting {} items, found {}'.format(
                    lineno, num_cols, len(data)))
        cur_pos = fd.tell()
        line = fd.readline().strip()
    config_dict['draws_sampling'] = draws_found
    fd.seek(cur_pos)
    return lineno

def scan_config(fp: TextIO, config_dict: Dict, lineno: int) -> int:
    """
    Scan initial stan_csv file comment lines and save non-default
    configuration information to config_dict.
    """
    cur_pos = fp.tell()
    line = fp.readline().strip()
    while len(line) > 0 and line.startswith('#'):
        lineno += 1
        if not line.endswith('(Default)'):
            line = line.lstrip(' #\t')
            key_val = line.split('=')
            if len(key_val) == 2:
                if key_val[0].strip() == 'file' and not key_val[1].endswith('csv'):
                    config_dict['data_file'] = key_val[1].strip()
                elif key_val[0].strip() != 'file':
                    config_dict[key_val[0].strip()] = key_val[1].strip()
        cur_pos = fp.tell()
        line = fp.readline().strip()
    fp.seek(cur_pos)
    return lineno

def _retrieve_last_line_of_file(f: typing.TextIO, read_chunk_size: int = 100) -> str:
    """
    Retrieve the last line of the file.

    From: https://stackoverflow.com/a/7167316/12907985

    Args:
        f: File-like object.
        read_chunk_size: Size of step in bytes to read backwards into the file. Default: 100.
    Returns:
        Last line of file, assuming it's found.
    """
    last_line = ""
    while True:
        # We grab chunks from the end of the file towards the beginning until we
        # get a new line
        # However, we need to be more careful because seeking directly from the end
        # of a text file is apparently undefined behavior. To address this, we
        # follow the approach from here: https://stackoverflow.com/a/51131242/12907985
        # First, move to the end of the file.
        f.seek(0, os.SEEK_END)
        # Then, just back from the current position (based on SEEK_SET and tell()
        # of the current position).
        f.seek(f.tell() - len(last_line) - read_chunk_size, os.SEEK_SET)
        # NOTE: This chunk isn't necessarily going back read_chunk_size characters, but
        # rather just some number of bytes. Depending on the encoding, it may be.
        # In any case, we don't care - we're just looking for the last line.
        chunk = f.read(read_chunk_size)

        if not chunk:
            # The whole file is one big line
            return last_line

        if not last_line and chunk.endswith('\n'):
            # Ignore the trailing newline at the end of the file (but include it
            # in the output).
            last_line = '\n'
            chunk = chunk[:-1]

        nl_pos = chunk.rfind('\n')
        # What's being searched for will have to be modified if you are searching
        # files with non-unix line endings.

        last_line = chunk[nl_pos + 1:] + last_line

        if nl_pos == -1:
            # The whole chunk is part of the last line.
            continue

        return last_line

def reader(f: TextIO, field_cnt: int) -> CsvReader:
    """
    Replaces the native CSV reader since it strips quotes when the field
    doesn't contain a comma... Disabling quoting breaks when the field does
    contain a comma, but fixes things when the sub-delimiters are present.

    The header will be skipped if present in the CSV file. Looking for the
    SEQ header should be sufficient to detect and skip it.
    """
    inst = _CsvReaderImpl(f, field_cnt)
    pos = f.tell()
    has_header = f.read(4).upper().startswith("SEQ|")
    if has_header:
        f.readline()
    else:
        f.seek(pos)
    return inst

def read_plot_data(in_file: TextIO) -> pd.DataFrame:
    """Read an AFL `plot_data` file."""
    def fix_map_size(x):
        if isinstance(x, str):
            return float(x.split('%')[0])
        return x

    # Skip the opening '# ' (if it exists)
    pos = in_file.tell()
    first_chars = in_file.read(2)
    if first_chars != '# ':
        in_file.seek(pos)

    # Read the data
    df = pd.read_csv(in_file, index_col=False, skipinitialspace=True)
    df.map_size = df.map_size.apply(fix_map_size)

    return df

def parse_quant_exprs(
    file: typing.TextIO,
    var_num: int,
) -> typing.Tuple[typing.List[QuantExpr], int]:
    """
    :param file: File handle with the cursor at the first quantifier line
    :param var_num: Number of variables
    :return: The parsed quant expressions and the file position of the SAT part
    """
    quants = []
    used_vars = [0 for _ in range(var_num)]  # Enforce invariants
    while True:
        line_pos = file.tell()
        line = file.readline()
        tokens = line.split(' ')
        if tokens[0] != 'a' and tokens[0] != 'e':
            for i in range(var_num):
                if used_vars[i] == 0:
                    raise_qdimacs_exception('variable {} is unbound'.format(i + 1))
            break
        # In QDIMACS, 'a' lines are universally quantified and 'e' lines are
        # existentially quantified.
        quantifier = Quantifier.Forall if tokens[0] == 'a' else Quantifier.Exists
        int_tokens = list(map(int, tokens[1:]))
        # Truncate the last '0' token
        int_tokens = int_tokens[:-1]
        # Bookkeeping
        for token in int_tokens:
            if used_vars[token - 1] != 0:
                raise Exception(
                    "variable {} has been bound multiple times".format(token))
            used_vars[token - 1] = 1
        quants.append(QuantExpr(quantifier, int_tokens))
    return quants, line_pos

def deck_load(inp: TextIO, name: str) -> Deck:
    """Parse a single deck from a formatted checkpoint in a file-like object."""
    start_pos = inp.tell()
    for line in inp:
        if name in line:
            break
    else:
        inp.seek(start_pos)  # return to start pos
        raise AttributeError(f"{name} could not be found in input string.")

    rest = line + "\n" + inp.read()  # read rest of file
    inp.seek(start_pos)  # return to start_pos

    # parse rest
    group = deck.parseString(rest)[0]
    my_name = "".join(group["key"])
    if my_name.strip() != name.strip():
        raise RuntimeWarning(
            f"expected deck {name}, but got a deck named {my_name}")
    return _to_deck(group)

def skip_initial_comment(f_stream: TextIO) -> int:
    """
    Initial comment in ~/.pg_service.conf is not always marked with '#'
    which crashes the parser. This function takes a file object and
    "rewinds" it to the beginning of the first section, from where on it
    can be parsed safely.

    :return: number of skipped lines
    """
    section_regex = r"\s*\["
    pos = f_stream.tell()
    lines_skipped = 0
    while True:
        line = f_stream.readline()
        if line == "":
            break
        if re.match(section_regex, line) is not None:
            f_stream.seek(pos)
            break
        else:
            pos += len(line)
            lines_skipped += 1
    return lines_skipped

def peek(file: TextIO) -> str:
    pos = file.tell()
    line = file.readline()
    file.seek(pos)
    return line