def pipe_to_file(response, path, size=0):
    """Pull the data off an HTTP response, shove it in a new file, and show
    progress.

    :arg response: A file-like object to read from
    :arg path: The path of the new file
    :arg size: The expected size, in bytes, of the download. 0 for unknown
        or to suppress progress indication (as for cached downloads)
    """
    def response_chunks(chunk_size):
        while True:
            chunk = response.read(chunk_size)
            if not chunk:
                break
            yield chunk

    # NOTE: `self` is not defined in this function's scope; the requirement
    # label printed here has to be supplied by the enclosing context.
    print('Downloading %s%s...' % (
        self._req.req,
        (' (%sK)' % (size / 1000)) if size > 1000 else ''))
    progress_indicator = (DownloadProgressBar(max=size).iter if size
                          else DownloadProgressSpinner().iter)
    with open(path, 'wb') as file:
        for chunk in progress_indicator(response_chunks(4096), 4096):
            file.write(chunk)
def _download_url(resp, link, content_file, hashes):
    try:
        total_length = int(resp.headers['content-length'])
    except (ValueError, KeyError, TypeError):
        total_length = 0

    cached_resp = getattr(resp, "from_cache", False)

    if logger.getEffectiveLevel() > logging.INFO:
        show_progress = False
    elif cached_resp:
        show_progress = False
    elif total_length > (40 * 1000):
        show_progress = True
    elif not total_length:
        show_progress = True
    else:
        show_progress = False

    show_url = link.show_url

    def resp_read(chunk_size):
        try:
            # Special case for urllib3.
            for chunk in resp.raw.stream(
                    chunk_size,
                    # We use decode_content=False here because we don't
                    # want urllib3 to mess with the raw bytes we get
                    # from the server. If we decompress inside of
                    # urllib3 then we cannot verify the checksum
                    # because the checksum will be of the compressed
                    # file. This breakage will only occur if the
                    # server adds a Content-Encoding header, which
                    # depends on how the server was configured:
                    # - Some servers will notice that the file isn't a
                    #   compressible file and will leave the file alone
                    #   and with an empty Content-Encoding
                    # - Some servers will notice that the file is
                    #   already compressed and will leave the file
                    #   alone and will add a Content-Encoding: gzip
                    #   header
                    # - Some servers won't notice anything at all and
                    #   will take a file that's already been compressed
                    #   and compress it again and set the
                    #   Content-Encoding: gzip header
                    #
                    # By setting this not to decode automatically we
                    # hope to eliminate problems with the second case.
                    decode_content=False):
                yield chunk
        except AttributeError:
            # Standard file-like object.
            while True:
                chunk = resp.raw.read(chunk_size)
                if not chunk:
                    break
                yield chunk

    def written_chunks(chunks):
        for chunk in chunks:
            content_file.write(chunk)
            yield chunk

    progress_indicator = _progress_indicator

    if link.netloc == PyPI.netloc:
        url = show_url
    else:
        url = link.url_without_fragment

    if show_progress:  # We don't show progress on cached responses
        if total_length:
            logger.info("Downloading %s (%s)", url,
                        format_size(total_length))
            progress_indicator = DownloadProgressBar(max=total_length).iter
        else:
            logger.info("Downloading %s", url)
            progress_indicator = DownloadProgressSpinner().iter
    elif cached_resp:
        logger.info("Using cached %s", url)
    else:
        logger.info("Downloading %s", url)

    logger.debug('Downloading from URL %s', link)

    downloaded_chunks = written_chunks(
        progress_indicator(
            resp_read(CONTENT_CHUNK_SIZE),
            CONTENT_CHUNK_SIZE
        )
    )
    if hashes:
        hashes.check_against_chunks(downloaded_chunks)
    else:
        consume(downloaded_chunks)
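# --- Sketch (not part of the snippet above) ----------------------------------
# The consume() helper called at the end of _download_url() is not defined in
# this snippet. A minimal sketch, assuming it only needs to exhaust the chunk
# iterator without keeping the results (so the side effects of
# written_chunks() still run), is the standard deque-based recipe:
from collections import deque


def consume(iterator):
    """Exhaust an iterator, discarding everything it yields."""
    # maxlen=0 makes the deque pull items as fast as possible and store none.
    deque(iterator, maxlen=0)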
def _download_url(resp, link, content_file):
    download_hash = None
    if link.hash and link.hash_name:
        try:
            download_hash = hashlib.new(link.hash_name)
        except ValueError:
            logger.warning(
                "Unsupported hash name %s for package %s",
                link.hash_name, link,
            )

    try:
        total_length = int(resp.headers['content-length'])
    except (ValueError, KeyError, TypeError):
        total_length = 0

    cached_resp = getattr(resp, "from_cache", False)

    if logger.getEffectiveLevel() > logging.INFO:
        show_progress = False
    elif cached_resp:
        show_progress = False
    elif total_length > (40 * 1000):
        show_progress = True
    elif not total_length:
        show_progress = True
    else:
        show_progress = False

    show_url = link.show_url
    try:
        def resp_read(chunk_size):
            try:
                # Special case for urllib3.
                for chunk in resp.raw.stream(
                        chunk_size,
                        # We use decode_content=False here because we don't
                        # want urllib3 to mess with the raw bytes we get
                        # from the server. If we decompress inside of
                        # urllib3 then we cannot verify the checksum
                        # because the checksum will be of the compressed
                        # file. This breakage will only occur if the
                        # server adds a Content-Encoding header, which
                        # depends on how the server was configured:
                        # - Some servers will notice that the file isn't a
                        #   compressible file and will leave the file alone
                        #   and with an empty Content-Encoding
                        # - Some servers will notice that the file is
                        #   already compressed and will leave the file
                        #   alone and will add a Content-Encoding: gzip
                        #   header
                        # - Some servers won't notice anything at all and
                        #   will take a file that's already been compressed
                        #   and compress it again and set the
                        #   Content-Encoding: gzip header
                        #
                        # By setting this not to decode automatically we
                        # hope to eliminate problems with the second case.
                        decode_content=False):
                    yield chunk
            except AttributeError:
                # Standard file-like object.
                while True:
                    chunk = resp.raw.read(chunk_size)
                    if not chunk:
                        break
                    yield chunk

        progress_indicator = lambda x, *a, **k: x

        if link.netloc == PyPI.netloc:
            url = show_url
        else:
            url = link.url_without_fragment

        if show_progress:  # We don't show progress on cached responses
            if total_length:
                logger.info(
                    "Downloading %s (%s)", url, format_size(total_length),
                )
                progress_indicator = DownloadProgressBar(
                    max=total_length,
                ).iter
            else:
                logger.info("Downloading %s", url)
                progress_indicator = DownloadProgressSpinner().iter
        elif cached_resp:
            logger.info("Using cached %s", url)
        else:
            logger.info("Downloading %s", url)

        logger.debug('Downloading from URL %s', link)

        for chunk in progress_indicator(resp_read(4096), 4096):
            if download_hash is not None:
                download_hash.update(chunk)
            content_file.write(chunk)
    finally:
        if link.hash and link.hash_name:
            _check_hash(download_hash, link)
    return download_hash
def anonymize_dicomdir(inputdir, outdir, write_logs=True):
    """ Anonymize all DICOM files of the input directory.

    Parameters
    ----------
    inputdir: str (mandatory)
        A folder that contains only DICOM files to be anonymized.
    outdir: str (mandatory)
        The anonymized DICOM files folder.
    write_logs: bool (optional, default True)
        If True write the anonymization logs.

    Returns
    -------
    dcmfiles: list
        The anonymized DICOM files.
    logfiles: list
        The anonymization log files.
    """
    # Load the first dataset
    input_dicoms = [os.path.join(inputdir, fname)
                    for fname in os.listdir(inputdir)]
    dataset = dicom.read_file(input_dicoms[0], force=True)

    # Load the tags to anonymize
    filedir = os.path.dirname(os.path.realpath(__file__))
    with open(os.path.join(filedir, "deidentify.json"), "r") as open_file:
        anon_tags = json.load(open_file)[1:]

    # Set up the desired callbacks and tags to be anonymized
    # Iterate over all the tags to anonymize according to PS 3.15-2008 and
    # supplement 142
    for tag_item in anon_tags:
        tag_repr = tag_item["Tag"][1:-1]
        action = tag_item["Basic Profile"]
        group, element = tag_repr.split(",", 1)

        # Deal with special tags
        if "xx" in group or "xx" in element:
            pattern = re.compile(tag_repr.replace("x", "[0-9A-Fa-f]"))
            CALLBACKS[tag_repr] = [pattern, callback_xxxx]
        # Deal with private tags
        elif "gggg" in group:
            if (0x0008, 0x0070) in dataset:
                MANUFACTURER.append(dataset[0x0008, 0x0070].value)
            if len(MANUFACTURER) > 0:
                CALLBACKS[tag_repr] = [None, callback_private]
            else:
                raise Exception(
                    "The '(0008,0070)' manufacturer tag is not specified and "
                    "is required to anonymize private tags.")
        # Deal with standard tags
        else:
            TAGS[tag_repr] = (int(group, 16), int(element, 16)), action

    # Now compile the diffusion private tags patterns
    filedir = os.path.dirname(os.path.realpath(__file__))
    with open(os.path.join(filedir, "private_deidentify.json"),
              "r") as open_file:
        private_anons = json.load(open_file)
    for key, values in private_anons.items():
        for value in values:
            pattern = re.compile(value["Tag"].replace("x", "[0-9A-Fa-f]"))
            PRIVATE_DEIDENTIFY.setdefault(key, []).append(pattern)

    # Process all DICOM files
    progress_indicator = DownloadProgressBar(max=len(input_dicoms))
    dcmfiles = []
    logfiles = []
    for cnt, input_dicom in enumerate(input_dicoms):
        statinfo = os.stat(input_dicom)
        DownloadProgressBar.suffix = "{0:.3f}MB".format(
            statinfo.st_size / 10e5)
        progress_indicator.next(1)
        output_dicom, output_log = anonymize_dicomfile(
            input_dicom, outdir, outname=str(cnt), write_log=write_logs)
        dcmfiles.append(output_dicom)
        logfiles.append(output_log)
    progress_indicator.finish()

    return dcmfiles, logfiles
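# --- Sketch (not part of the snippet above) ----------------------------------
# How the wildcard handling in anonymize_dicomdir() works: every "x" in a
# PS 3.15 tag such as "50xx,xxxx" is rewritten into a hex-digit character
# class, so the compiled pattern matches any concrete tag in that range.
# The tag values below are made up for illustration.
import re

tag_repr = "50xx,xxxx"
pattern = re.compile(tag_repr.replace("x", "[0-9A-Fa-f]"))
print(bool(pattern.match("5012,0034")))  # True: inside the 50xx group range
print(bool(pattern.match("0010,0010")))  # False: PatientName does not match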
def split_series(dicom_dir, outdir):
    """ Split all the DICOM files of the input folder into one sub-folder
    per series.

    DICOM files are searched recursively in the input folder, and every file
    found is expected to be a DICOM file. All files are expected to come from
    a single session.

    Parameters
    ----------
    dicom_dir: str (mandatory)
        a folder containing DICOM files to organize by series.
    outdir: str (mandatory)
        the destination folder.
    """
    # Read the incoming directory:
    # process each file in this directory and its sub-directories
    # expect each file to be a DICOM file
    to_treat_dicom = []
    for root, dirs, files in os.walk(dicom_dir):
        to_treat_dicom.extend([
            os.path.join(root, basename) for basename in files])

    # Go through each file: expected to be in DICOM format
    progress_indicator = DownloadProgressBar(max=len(to_treat_dicom))
    acquisition_datetime = None
    for dicom_file in to_treat_dicom:

        # Update progress bar
        statinfo = os.stat(dicom_file)
        DownloadProgressBar.suffix = "{0:.3f}MB".format(
            statinfo.st_size / 10e5)
        progress_indicator.next(1)

        # Get the time of last modification
        mtime = os.path.getmtime(dicom_file)

        # Read DICOM dataset
        dataset = dicom.read_file(dicom_file)

        # Find character encoding of DICOM attributes:
        # we currently expect encoding to be ISO_IR 100
        if (0x0008, 0x0005) in dataset:
            SpecificCharacterSet = dataset[0x0008, 0x0005].value
            if SpecificCharacterSet != "ISO_IR 100":
                print("'{0}' file encoding is not ISO_IR 100 as "
                      "expected.".format(dicom_file))
                continue
        else:
            print("Can't check encoding of '{0}', missing (0x0008, 0x0005) "
                  "tag.".format(dicom_file))

        # Process other DICOM attributes:
        # decode strings assuming 'ISO_IR 100'
        SeriesDescription = None
        SOPInstanceUID = dataset[0x0008, 0x0018].value
        if (0x0008, 0x103e) in dataset:
            SeriesDescription = cleanup(decode(dataset[0x0008, 0x103e].value))
        SeriesNumber = dataset[0x0020, 0x0011].value
        EchoTime = dataset[0x0018, 0x0081].value

        # Check the session time
        current_acquisition_datetime = (dataset[0x0008, 0x0020].value +
                                        dataset[0x0008, 0x0030].value)
        if acquisition_datetime is None:
            acquisition_datetime = current_acquisition_datetime
        elif acquisition_datetime != current_acquisition_datetime:
            raise ValueError(
                "Two sessions detected in the input folder '{0}': {1} - "
                "{2}.".format(dicom_dir, acquisition_datetime,
                              current_acquisition_datetime))

        # Build the full path to the outgoing directory:
        # we assume that there is only one session
        if SeriesDescription:
            serie_name = (SeriesDescription + "_" + str(EchoTime) + "_" +
                          str(SeriesNumber).rjust(6, "0"))
        else:
            serie_name = str(EchoTime) + "_" + str(SeriesNumber).rjust(6, "0")
        output_dicom_dir = os.path.join(outdir, serie_name)

        # Check that the destination folder exists
        if not os.path.isdir(output_dicom_dir):
            os.mkdir(output_dicom_dir)

        # Build a new name for the DICOM file
        output_dicom_file = os.path.join(output_dicom_dir,
                                         SOPInstanceUID + '.dcm')

        # Copy DICOM file:
        # handle case where outgoing file already exists
        if os.path.exists(output_dicom_file):
            # Compare modification time and keep the most recent file
            if os.path.getmtime(output_dicom_file) < mtime:
                shutil.copy2(dicom_file, output_dicom_file)
        # file does not exist and can be copied
        else:
            shutil.copy2(dicom_file, output_dicom_file)
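# --- Sketch (not part of the snippet above) ----------------------------------
# The decode() and cleanup() helpers used by split_series() are not included
# in this snippet. The sketch below is an assumption about their intent:
# decode() maps ISO_IR 100 (ISO 8859-1 / Latin-1) bytes to text, and cleanup()
# strips characters that are awkward in folder names. Neither is the project's
# actual implementation.
import re


def decode(value):
    """Decode a DICOM attribute value, assuming ISO_IR 100 (Latin-1)."""
    if isinstance(value, bytes):
        return value.decode("latin-1")
    return value


def cleanup(value):
    """Replace whitespace and punctuation so the value is folder-name safe."""
    return re.sub(r"[^\w\-]+", "_", value.strip())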
    # (Tail of the enclosing function; its beginning is not included in this
    #  snippet.)
    if each_clazz_num * CLAZZ_NUM < len(conf_rat_list):
        clazz_list[-1] += conf_rat_list[each_clazz_num * CLAZZ_NUM:]
    for clazz_index, clazz in enumerate(clazz_list):
        for j in clazz:
            clazz_dict[list(j.keys())[0]] = clazz_index
    for i in conf_rat_list:
        conf_rat_dict[list(i.keys())[0]] = list(i.values())[0]


conf_rank()

dataset = open(sys.argv[1]).read().split("\n")[1:]
output = open(sys.argv[2], "a")
all_count = len(dataset)
bar = DownloadProgressBar(max=all_count - 1)
value = 0
output.write(get_features("", head=True))
print("Generating all features ...")
for i in bar.iter(range(all_count - 1)):
    line = dataset[value]
    author_name = line.split(",")[0]
    if author_name != "":
        feature = get_features(author_name)
        output.write(feature)
    value += 1


def check_dimension():
    s = open(sys.argv[2], "r")
    print("Checking feature dimensionality")
def split_series(dicom_dir, outdir):
    """ Split all the DICOM files of the input folder into one sub-folder
    per series.

    DICOM files are searched recursively in the input folder, and every file
    found is expected to be a DICOM file. All files are expected to come from
    a single session.

    Parameters
    ----------
    dicom_dir: str (mandatory)
        a folder containing DICOM files to organize by series.
    outdir: str (mandatory)
        the destination folder.
    """
    # Read the incoming directory:
    # process each file in this directory and its sub-directories
    # expect each file to be a DICOM file
    to_treat_dicom = []
    for root, dirs, files in os.walk(dicom_dir):
        to_treat_dicom.extend(
            [os.path.join(root, basename) for basename in files])

    # Go through each file: expected to be in DICOM format
    progress_indicator = DownloadProgressBar(max=len(to_treat_dicom))
    acquisition_datetime = None
    for dicom_file in to_treat_dicom:

        # Update progress bar
        statinfo = os.stat(dicom_file)
        DownloadProgressBar.suffix = "{0:.3f}MB".format(statinfo.st_size /
                                                        10e5)
        progress_indicator.next(1)

        # Get the time of last modification
        mtime = os.path.getmtime(dicom_file)

        # Read DICOM dataset
        dataset = dicom.read_file(dicom_file)

        # Find character encoding of DICOM attributes:
        # we currently expect encoding to be ISO_IR 100
        if (0x0008, 0x0005) in dataset:
            SpecificCharacterSet = dataset[0x0008, 0x0005].value
            if SpecificCharacterSet != "ISO_IR 100":
                print("'{0}' file encoding is not ISO_IR 100 as "
                      "expected.".format(dicom_file))
                continue
        else:
            print("Can't check encoding of '{0}', missing (0x0008, 0x0005) "
                  "tag.".format(dicom_file))

        # Process other DICOM attributes:
        # decode strings assuming 'ISO_IR 100'
        SeriesDescription = None
        SOPInstanceUID = dataset[0x0008, 0x0018].value
        if (0x0008, 0x103e) in dataset:
            SeriesDescription = cleanup(decode(dataset[0x0008, 0x103e].value))
        SeriesNumber = dataset[0x0020, 0x0011].value
        EchoTime = dataset[0x0018, 0x0081].value

        # Check the session time
        current_acquisition_datetime = (dataset[0x0008, 0x0020].value +
                                        dataset[0x0008, 0x0030].value)
        if acquisition_datetime is None:
            acquisition_datetime = current_acquisition_datetime
        elif acquisition_datetime != current_acquisition_datetime:
            raise ValueError(
                "Two sessions detected in the input folder '{0}': {1} - "
                "{2}.".format(dicom_dir, acquisition_datetime,
                              current_acquisition_datetime))

        # Build the full path to the outgoing directory:
        # we assume that there is only one session
        if SeriesDescription:
            serie_name = (SeriesDescription + "_" + str(EchoTime) + "_" +
                          str(SeriesNumber).rjust(6, "0"))
        else:
            serie_name = str(EchoTime) + "_" + str(SeriesNumber).rjust(6, "0")
        output_dicom_dir = os.path.join(outdir, serie_name)

        # Check that the destination folder exists
        if not os.path.isdir(output_dicom_dir):
            os.mkdir(output_dicom_dir)

        # Build a new name for the DICOM file
        output_dicom_file = os.path.join(output_dicom_dir,
                                         SOPInstanceUID + '.dcm')

        # Copy DICOM file:
        # handle case where outgoing file already exists
        if os.path.exists(output_dicom_file):
            # Compare modification time and keep the most recent file
            if os.path.getmtime(output_dicom_file) < mtime:
                shutil.copy2(dicom_file, output_dicom_file)
        # file does not exist and can be copied
        else:
            shutil.copy2(dicom_file, output_dicom_file)
def anonymize_dicomdir(inputdir, outdir, write_logs=True):
    """ Anonymize all DICOM files of the input directory.

    Parameters
    ----------
    inputdir: str (mandatory)
        A folder that contains only DICOM files to be anonymized.
    outdir: str (mandatory)
        The anonymized DICOM files folder.
    write_logs: bool (optional, default True)
        If True write the anonymization logs.

    Returns
    -------
    dcmfiles: list
        The anonymized DICOM files.
    logfiles: list
        The anonymization log files.
    """
    # Load the first dataset
    input_dicoms = [
        os.path.join(inputdir, fname) for fname in os.listdir(inputdir)
    ]
    dataset = dicom.read_file(input_dicoms[0], force=True)

    # Load the tags to anonymize
    filedir = os.path.dirname(os.path.realpath(__file__))
    with open(os.path.join(filedir, "deidentify.json"), "r") as open_file:
        anon_tags = json.load(open_file)[1:]

    # Set up the desired callbacks and tags to be anonymized
    # Iterate over all the tags to anonymize according to PS 3.15-2008 and
    # supplement 142
    for tag_item in anon_tags:
        tag_repr = tag_item["Tag"][1:-1]
        action = tag_item["Basic Profile"]
        group, element = tag_repr.split(",", 1)

        # Deal with special tags
        if "xx" in group or "xx" in element:
            pattern = re.compile(tag_repr.replace("x", "[0-9A-Fa-f]"))
            CALLBACKS[tag_repr] = [pattern, callback_xxxx]
        # Deal with private tags
        elif "gggg" in group:
            if (0x0008, 0x0070) in dataset:
                MANUFACTURER.append(dataset[0x0008, 0x0070].value)
            if len(MANUFACTURER) > 0:
                CALLBACKS[tag_repr] = [None, callback_private]
            else:
                raise Exception(
                    "The '(0008,0070)' manufacturer tag is not specified and "
                    "is required to anonymize private tags.")
        # Deal with standard tags
        else:
            TAGS[tag_repr] = (int(group, 16), int(element, 16)), action

    # Now compile the diffusion private tags patterns
    filedir = os.path.dirname(os.path.realpath(__file__))
    with open(os.path.join(filedir, "private_deidentify.json"),
              "r") as open_file:
        private_anons = json.load(open_file)
    for key, values in private_anons.items():
        for value in values:
            pattern = re.compile(value["Tag"].replace("x", "[0-9A-Fa-f]"))
            PRIVATE_DEIDENTIFY.setdefault(key, []).append(pattern)

    # Process all DICOM files
    progress_indicator = DownloadProgressBar(max=len(input_dicoms))
    dcmfiles = []
    logfiles = []
    for cnt, input_dicom in enumerate(input_dicoms):
        statinfo = os.stat(input_dicom)
        DownloadProgressBar.suffix = "{0:.3f}MB".format(statinfo.st_size /
                                                        10e5)
        progress_indicator.next(1)
        output_dicom, output_log = anonymize_dicomfile(input_dicom,
                                                       outdir,
                                                       outname=str(cnt),
                                                       write_log=write_logs)
        dcmfiles.append(output_dicom)
        logfiles.append(output_log)
    progress_indicator.finish()

    return dcmfiles, logfiles
rs['author'] = info[2:].split(",") elif info[:2] == '#t': rs['year'] = int(info[2:]) elif info[:2] == '#c': rs['conf'] = info[2:] elif info[:2] == '#%': rs['ref'].append(int(info[2:])) if rs['year'] == None: print(s) if rs['year'] > 2013: return False return index, rs paper_dict = dict() all_count = len(papers) bar_count = int(ceil(len(papers) / 1000)) bar = DownloadProgressBar(max=bar_count) _i = 0 for i in bar.iter(range(bar_count)): for j in range(_i * 1000, (_i + 1) * 1000): if j == all_count: break paper = handlePaperInfo(papers[j]) if paper == False: continue paper_dict[paper[0]] = paper[1] _i += 1 papers = paper_dict print("正在构建被引用关系") cnt = 0 for p_index in papers.keys():