Example 1
        def pipe_to_file(response, path, size=0):
            """Pull the data off an HTTP response, shove it in a new file, and
            show progress.

            :arg response: A file-like object to read from
            :arg path: The path of the new file
            :arg size: The expected size, in bytes, of the download. 0 for
                unknown or to suppress progress indication (as for cached
                downloads)

            """
            def response_chunks(chunk_size):
                while True:
                    chunk = response.read(chunk_size)
                    if not chunk:
                        break
                    yield chunk

            print('Downloading %s%s...' %
                  (self._req.req,
                   (' (%sK)' % (size / 1000)) if size > 1000 else ''))
            progress_indicator = (DownloadProgressBar(
                max=size).iter if size else DownloadProgressSpinner().iter)
            with open(path, 'wb') as file:
                for chunk in progress_indicator(response_chunks(4096), 4096):
                    file.write(chunk)
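
The snippet above relies on its enclosing context (self._req.req) and on what look like pip-style progress helpers (DownloadProgressBar, DownloadProgressSpinner). For comparison, a minimal standard-library-only sketch of the same chunked read-and-write loop, with a crude byte counter instead of a progress widget, could look like this (the URL and chunk size are placeholders, not part of the original code):

import urllib.request

def download_to_file(url, path, chunk_size=4096):
    """Stream an HTTP response to *path* in fixed-size chunks."""
    written = 0
    with urllib.request.urlopen(url) as response, open(path, "wb") as out:
        while True:
            chunk = response.read(chunk_size)
            if not chunk:
                break
            out.write(chunk)
            written += len(chunk)
            # Crude progress indication: overwrite the same console line.
            print("\r%d bytes downloaded" % written, end="", flush=True)
    print()
    return written
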
Example 2
def _download_url(resp, link, content_file, hashes):
    try:
        total_length = int(resp.headers['content-length'])
    except (ValueError, KeyError, TypeError):
        total_length = 0

    cached_resp = getattr(resp, "from_cache", False)

    if logger.getEffectiveLevel() > logging.INFO:
        show_progress = False
    elif cached_resp:
        show_progress = False
    elif total_length > (40 * 1000):
        show_progress = True
    elif not total_length:
        show_progress = True
    else:
        show_progress = False

    show_url = link.show_url

    def resp_read(chunk_size):
        try:
            # Special case for urllib3.
            for chunk in resp.raw.stream(
                    chunk_size,
                    # We use decode_content=False here because we don't
                    # want urllib3 to mess with the raw bytes we get
                    # from the server. If we decompress inside of
                    # urllib3 then we cannot verify the checksum
                    # because the checksum will be of the compressed
                    # file. This breakage will only occur if the
                    # server adds a Content-Encoding header, which
                    # depends on how the server was configured:
                    # - Some servers will notice that the file isn't a
                    #   compressible file and will leave the file alone
                    #   and with an empty Content-Encoding
                    # - Some servers will notice that the file is
                    #   already compressed and will leave the file
                    #   alone and will add a Content-Encoding: gzip
                    #   header
                    # - Some servers won't notice anything at all and
                    #   will take a file that's already been compressed
                    #   and compress it again and set the
                    #   Content-Encoding: gzip header
                    #
                    # By setting this not to decode automatically we
                    # hope to eliminate problems with the second case.
                    decode_content=False):
                yield chunk
        except AttributeError:
            # Standard file-like object.
            while True:
                chunk = resp.raw.read(chunk_size)
                if not chunk:
                    break
                yield chunk

    def written_chunks(chunks):
        for chunk in chunks:
            content_file.write(chunk)
            yield chunk

    progress_indicator = _progress_indicator

    if link.netloc == PyPI.netloc:
        url = show_url
    else:
        url = link.url_without_fragment

    if show_progress:  # We don't show progress on cached responses
        if total_length:
            logger.info("Downloading %s (%s)", url, format_size(total_length))
            progress_indicator = DownloadProgressBar(max=total_length).iter
        else:
            logger.info("Downloading %s", url)
            progress_indicator = DownloadProgressSpinner().iter
    elif cached_resp:
        logger.info("Using cached %s", url)
    else:
        logger.info("Downloading %s", url)

    logger.debug('Downloading from URL %s', link)

    downloaded_chunks = written_chunks(
        progress_indicator(
            resp_read(CONTENT_CHUNK_SIZE),
            CONTENT_CHUNK_SIZE
        )
    )
    if hashes:
        hashes.check_against_chunks(downloaded_chunks)
    else:
        consume(downloaded_chunks)
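
Because resp_read, progress_indicator, and written_chunks are all lazy generators, no bytes are read or written until something iterates the chain: hashes.check_against_chunks drives it while hashing, and consume simply drains it when there is nothing to verify. A rough sketch of that last helper, in the style of the classic itertools "consume" recipe (an assumption about its shape, not necessarily pip's exact code):

from collections import deque

def consume(iterator):
    # Exhaust the iterator without keeping any items around; this is what
    # forces the lazy download/write pipeline above to actually run.
    deque(iterator, maxlen=0)
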
Example 3
def _download_url(resp, link, content_file):
    download_hash = None
    if link.hash and link.hash_name:
        try:
            download_hash = hashlib.new(link.hash_name)
        except ValueError:
            logger.warning(
                "Unsupported hash name %s for package %s",
                link.hash_name, link,
            )

    try:
        total_length = int(resp.headers['content-length'])
    except (ValueError, KeyError, TypeError):
        total_length = 0

    cached_resp = getattr(resp, "from_cache", False)

    if logger.getEffectiveLevel() > logging.INFO:
        show_progress = False
    elif cached_resp:
        show_progress = False
    elif total_length > (40 * 1000):
        show_progress = True
    elif not total_length:
        show_progress = True
    else:
        show_progress = False

    show_url = link.show_url
    try:
        def resp_read(chunk_size):
            try:
                # Special case for urllib3.
                for chunk in resp.raw.stream(
                        chunk_size,
                        # We use decode_content=False here because we don't
                        # want urllib3 to mess with the raw bytes we get
                        # from the server. If we decompress inside of
                        # urllib3 then we cannot verify the checksum
                        # because the checksum will be of the compressed
                        # file. This breakage will only occur if the
                        # server adds a Content-Encoding header, which
                        # depends on how the server was configured:
                        # - Some servers will notice that the file isn't a
                        #   compressible file and will leave the file alone
                        #   and with an empty Content-Encoding
                        # - Some servers will notice that the file is
                        #   already compressed and will leave the file
                        #   alone and will add a Content-Encoding: gzip
                        #   header
                        # - Some servers won't notice anything at all and
                        #   will take a file that's already been compressed
                        #   and compress it again and set the
                        #   Content-Encoding: gzip header
                        #
                        # By setting this not to decode automatically we
                        # hope to eliminate problems with the second case.
                        decode_content=False):
                    yield chunk
            except AttributeError:
                # Standard file-like object.
                while True:
                    chunk = resp.raw.read(chunk_size)
                    if not chunk:
                        break
                    yield chunk

        progress_indicator = lambda x, *a, **k: x

        if link.netloc == PyPI.netloc:
            url = show_url
        else:
            url = link.url_without_fragment

        if show_progress:  # We don't show progress on cached responses
            if total_length:
                logger.info(
                    "Downloading %s (%s)", url, format_size(total_length),
                )
                progress_indicator = DownloadProgressBar(
                    max=total_length,
                ).iter
            else:
                logger.info("Downloading %s", url)
                progress_indicator = DownloadProgressSpinner().iter
        elif cached_resp:
            logger.info("Using cached %s", url)
        else:
            logger.info("Downloading %s", url)

        logger.debug('Downloading from URL %s', link)

        for chunk in progress_indicator(resp_read(4096), 4096):
            if download_hash is not None:
                download_hash.update(chunk)
            content_file.write(chunk)
    finally:
        if link.hash and link.hash_name:
            _check_hash(download_hash, link)
    return download_hash
Example 4
def anonymize_dicomdir(inputdir, outdir, write_logs=True):
    """ Anonymize all DICOM files of the input directory.

    Parameters
    ----------
    inputdir: str (mandatory)
        A folder that contains only DICOM files to be anonymized.
    outdir: str (mandatory)
        The folder where the anonymized DICOM files are written.
    write_logs: bool (optional, default True)
        If True, write the anonymization logs.

    Returns
    -------
    dcmfiles: list
        The anonymized DICOM files.
    logfiles: list
        The anonymization log files.

    """
    # Load the first dataset
    input_dicoms = [os.path.join(inputdir, fname)
                    for fname in os.listdir(inputdir)]
    dataset = dicom.read_file(input_dicoms[0], force=True)

    # Load the tags to anonymize
    filedir = os.path.dirname(os.path.realpath(__file__))
    with open(os.path.join(filedir, "deidentify.json"), "r") as open_file:
        anon_tags = json.load(open_file)[1:]

    # Set up the desired callbacks and tags to be anonymized
    # Iterate over all the tags to anonymize according to PS 3.15-2008 and
    # Supplement 142.
    for tag_item in anon_tags:
        tag_repr = tag_item["Tag"][1:-1]
        action = tag_item["Basic Profile"]
        group, element = tag_repr.split(",", 1)

        # Deal with special tags
        if "xx" in group or "xx" in element:
            pattern = re.compile(tag_repr.replace("x", "[0-9A-Fa-f]"))
            CALLBACKS[tag_repr] = [pattern, callback_xxxx]

        # Deal with private tags
        elif "gggg" in group:
            if (0x0008, 0x0070) in dataset:
                MANUFACTURER.append(dataset[0x0008, 0x0070].value)
            if len(MANUFACTURER) > 0:
                CALLBACKS[tag_repr] = [None, callback_private]
            else:
                raise Exception(
                    "The '(0008,0070)' manufacturer tag is not specified and "
                    "is required to anonymize private tags.")

        # Deal with standard tags
        else:
            TAGS[tag_repr] = (int(group, 16), int(element, 16)), action

    # Now compile the diffusion private tags patterns
    filedir = os.path.dirname(os.path.realpath(__file__))
    with open(os.path.join(filedir, "private_deidentify.json"),
              "r") as open_file:
        private_anons = json.load(open_file)
    for key, values in private_anons.items():
        for value in values:
            pattern = re.compile(value["Tag"].replace("x", "[0-9A-Fa-f]"))
            PRIVATE_DEIDENTIFY.setdefault(key, []).append(pattern)

    # Process all DICOM files
    progress_indicator = DownloadProgressBar(max=len(input_dicoms))
    dcmfiles = []
    logfiles = []
    for cnt, input_dicom in enumerate(input_dicoms):
        statinfo = os.stat(input_dicom)
        DownloadProgressBar.suffix = "{0:.3f}MB".format(
            statinfo.st_size / 1e6)
        progress_indicator.next(1)
        output_dicom, output_log = anonymize_dicomfile(
            input_dicom, outdir, outname=str(cnt), write_log=write_logs)
        dcmfiles.append(output_dicom)
        logfiles.append(output_log)
    progress_indicator.finish()

    return dcmfiles, logfiles
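
For the wildcard ("xx") entries handled above, replacing every "x" with a hex-digit character class turns the tag template into a regular expression that matches any concrete tag in that repeating group. A small, self-contained illustration (the template and tag strings are made up for the example, not taken from deidentify.json):

import re

tag_repr = "50xx,xxxx"  # hypothetical wildcard tag template, for illustration
pattern = re.compile(tag_repr.replace("x", "[0-9A-Fa-f]"))

print(bool(pattern.match("5000,3000")))  # True: any hex digits fit the 'x' slots
print(bool(pattern.match("5014,abcd")))  # True
print(bool(pattern.match("0008,0018")))  # False: group does not start with 50
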
Example 5
def split_series(dicom_dir, outdir):
    """ Split all the folder Dicom files by series in different folders.

    Dicom files are searched recursively in the input folder and all files
    are expected to be Dicom files.

    Expect to split files from a single session.

    Parameters
    ----------
    dicom_dir: str (mandatory)
        A folder containing DICOM files to organize by series.
    outdir: str (mandatory)
        The destination folder.
    """
    # Read the incoming directory:
    # process each file in this directory and its sub-directories
    # expect each file to be a DICOM file
    to_treat_dicom = []
    for root, dirs, files in os.walk(dicom_dir):
        to_treat_dicom.extend([
            os.path.join(root, basename) for basename in files])

    # Go through each file: expected to be in Dicom format
    progress_indicator = DownloadProgressBar(max=len(to_treat_dicom))
    acquisition_datetime = None
    for dicom_file in to_treat_dicom:

        # Update progress bar
        statinfo = os.stat(dicom_file)
        DownloadProgressBar.suffix = "{0:.3f}MB".format(
            statinfo.st_size / 1e6)
        progress_indicator.next(1)

        # Get the time of last modification
        mtime = os.path.getmtime(dicom_file)

        # Read DICOM dataset
        dataset = dicom.read_file(dicom_file)

        # Find character encoding of DICOM attributes:
        # we currently expect encoding to be ISO_IR 100
        if (0x0008, 0x0005) in dataset:
            SpecificCharacterSet = dataset[0x0008, 0x0005].value
            if SpecificCharacterSet != "ISO_IR 100":
                print("'{0}' file encoding is not ISO_IR 100 as "
                      "expected.".format(dicom_file))
                continue
        else:
            print("Can't check encoding of '{0}', missing (0x0008, 0x0005) "
                  "tag.".format(dicom_file))

        # Process other DICOM attributes:
        # decode strings assuming 'ISO_IR 100'
        SeriesDescription = None
        SOPInstanceUID = dataset[0x0008, 0x0018].value
        if (0x0008, 0x103e) in dataset:
            SeriesDescription = cleanup(decode(dataset[0x0008, 0x103e].value))
        SeriesNumber = dataset[0x0020, 0x0011].value
        EchoTime = dataset[0x0018, 0x0081].value

        # Check the session time
        current_acquisition_datetime = (dataset[0x0008, 0x0020].value +
                                        dataset[0x0008, 0x0030].value)
        if acquisition_datetime is None:
            acquisition_datetime = current_acquisition_datetime
        elif acquisition_datetime != current_acquisition_datetime:
            raise ValueError(
                "Two sessions detected in the input folder '{0}': {1} - "
                "{2}.".format(dicom_dir, acquisition_datetime,
                              current_acquisition_datetime))

        # Build the full path to the outgoing directory:
        # we assume that there is only one session
        if SeriesDescription:
            serie_name = (SeriesDescription + "_" + str(EchoTime) + "_" +
                          str(SeriesNumber).rjust(6, "0"))
        else:
            serie_name = str(EchoTime) + "_" + str(SeriesNumber).rjust(6, "0")
        output_dicom_dir = os.path.join(outdir, serie_name)

        # Create the destination folder if it does not exist
        if not os.path.isdir(output_dicom_dir):
            os.mkdir(output_dicom_dir)

        # Build a new name for the DICOM file
        output_dicom_file = os.path.join(output_dicom_dir,
                                         SOPInstanceUID + '.dcm')

        # Copy DICOM file:
        # handle case where outgoing file already exists
        if os.path.exists(output_dicom_file):

            # Compare modification time and keep the most recent file
            if os.path.getmtime(output_dicom_file) < mtime:
                shutil.copy2(dicom_file, output_dicom_file)

        # File does not exist yet, so it can be copied
        else:
            shutil.copy2(dicom_file, output_dicom_file)
Example 6
    if each_clazz_num * CLAZZ_NUM < len(conf_rat_list):
        clazz_list[-1] += conf_rat_list[each_clazz_num * CLAZZ_NUM:]

    for clazz_index, clazz in enumerate(clazz_list):
        for j in clazz:
            clazz_dict[list(j.keys())[0]] = clazz_index

    for i in conf_rat_list:
        conf_rat_dict[list(i.keys())[0]] = list(i.values())[0]


conf_rank()
dataset = open(sys.argv[1]).read().split("\n")[1:]
output = open(sys.argv[2], "a")
all_count = len(dataset)
bar = DownloadProgressBar(max=all_count - 1)
value = 0
output.write(get_features("", head=True))
print("正在生成全部特征 ...")
for i in bar.iter(range(all_count - 1)):
    line = dataset[value]
    author_name = line.split(",")[0]
    if author_name != "":
        feature = get_features(author_name)
    output.write(feature)
    value += 1


def check_dimension():
    s = open(sys.argv[2], "r")
    print("正在检查维数")
Example 7
def split_series(dicom_dir, outdir):
    """ Split all the folder Dicom files by series in different folders.

    Dicom files are searched recursively in the input folder and all files
    are expected to be Dicom files.

    Expect to split files from a single session.

    Parameters
    ----------
    dicom_dir: str (mandatory)
        A folder containing DICOM files to organize by series.
    outdir: str (mandatory)
        The destination folder.
    """
    # Read the incoming directory:
    # process each file in this directory and its sub-directories
    # expect each file to be a DICOM file
    to_treat_dicom = []
    for root, dirs, files in os.walk(dicom_dir):
        to_treat_dicom.extend(
            [os.path.join(root, basename) for basename in files])

    # Go through each file: expected to be in Dicom format
    progress_indicator = DownloadProgressBar(max=len(to_treat_dicom))
    acquisition_datetime = None
    for dicom_file in to_treat_dicom:

        # Update progress bar
        statinfo = os.stat(dicom_file)
        DownloadProgressBar.suffix = "{0:.3f}MB".format(statinfo.st_size /
                                                        1e6)
        progress_indicator.next(1)

        # Get the time of last modification
        mtime = os.path.getmtime(dicom_file)

        # Read DICOM dataset
        dataset = dicom.read_file(dicom_file)

        # Find character encoding of DICOM attributes:
        # we currently expect encoding to be ISO_IR 100
        if (0x0008, 0x0005) in dataset:
            SpecificCharacterSet = dataset[0x0008, 0x0005].value
            if SpecificCharacterSet != "ISO_IR 100":
                print("'{0}' file encoding is not ISO_IR 100 as "
                      "expected.".format(dicom_file))
                continue
        else:
            print("Can't check encoding of '{0}', missing (0x0008, 0x0005) "
                  "tag.".format(dicom_file))

        # Process other DICOM attributes:
        # decode strings assuming 'ISO_IR 100'
        SeriesDescription = None
        SOPInstanceUID = dataset[0x0008, 0x0018].value
        if (0x0008, 0x103e) in dataset:
            SeriesDescription = cleanup(decode(dataset[0x0008, 0x103e].value))
        SeriesNumber = dataset[0x0020, 0x0011].value
        EchoTime = dataset[0x0018, 0x0081].value

        # Check the session time
        current_acquisition_datetime = (dataset[0x0008, 0x0020].value +
                                        dataset[0x0008, 0x0030].value)
        if acquisition_datetime is None:
            acquisition_datetime = current_acquisition_datetime
        elif acquisition_datetime != current_acquisition_datetime:
            raise ValueError(
                "Two sessions detected in the input folder '{0}': {1} - "
                "{2}.".format(dicom_dir, acquisition_datetime,
                              current_acquisition_datetime))

        # Build the full path to the outgoing directory:
        # we assume that there is only one session
        if SeriesDescription:
            serie_name = (SeriesDescription + "_" + str(EchoTime) + "_" +
                          str(SeriesNumber).rjust(6, "0"))
        else:
            serie_name = str(EchoTime) + "_" + str(SeriesNumber).rjust(6, "0")
        output_dicom_dir = os.path.join(outdir, serie_name)

        # Create the destination folder if it does not exist
        if not os.path.isdir(output_dicom_dir):
            os.mkdir(output_dicom_dir)

        # Build a new name for the DICOM file
        output_dicom_file = os.path.join(output_dicom_dir,
                                         SOPInstanceUID + '.dcm')

        # Copy DICOM file:
        # handle case where outgoing file already exists
        if os.path.exists(output_dicom_file):

            # Compare modification time and keep the most recent file
            if os.path.getmtime(output_dicom_file) < mtime:
                shutil.copy2(dicom_file, output_dicom_file)

        # File does not exist yet, so it can be copied
        else:
            shutil.copy2(dicom_file, output_dicom_file)
Example 8
def anonymize_dicomdir(inputdir, outdir, write_logs=True):
    """ Anonymize all DICOM files of the input directory.

    Parameters
    ----------
    inputdir: str (mandatory)
        A folder that contains only DICOM files to be anonymized.
    outdir: str (mandatory)
        The folder where the anonymized DICOM files are written.
    write_logs: bool (optional, default True)
        If True, write the anonymization logs.

    Returns
    -------
    dcmfiles: list
        The anonymized DICOM files.
    logfiles: list
        The anonymization log files.

    """
    # Load the first dataset
    input_dicoms = [
        os.path.join(inputdir, fname) for fname in os.listdir(inputdir)
    ]
    dataset = dicom.read_file(input_dicoms[0], force=True)

    # Load the tags to anonymize
    filedir = os.path.dirname(os.path.realpath(__file__))
    with open(os.path.join(filedir, "deidentify.json"), "r") as open_file:
        anon_tags = json.load(open_file)[1:]

    # Set up the desired callbacks and tags to be anonymized
    # Iterate over all the tags to anonymize according to PS 3.15-2008 and
    # Supplement 142.
    for tag_item in anon_tags:
        tag_repr = tag_item["Tag"][1:-1]
        action = tag_item["Basic Profile"]
        group, element = tag_repr.split(",", 1)

        # Deal with special tags
        if "xx" in group or "xx" in element:
            pattern = re.compile(tag_repr.replace("x", "[0-9A-Fa-f]"))
            CALLBACKS[tag_repr] = [pattern, callback_xxxx]

        # Deal with private tags
        elif "gggg" in group:
            if (0x0008, 0x0070) in dataset:
                MANUFACTURER.append(dataset[0x0008, 0x0070].value)
            if len(MANUFACTURER) > 0:
                CALLBACKS[tag_repr] = [None, callback_private]
            else:
                raise Exception(
                    "The '(0008,0070)' manufacturer tag is not specified and "
                    "is required to anonymize private tags.")

        # Deal with standard tags
        else:
            TAGS[tag_repr] = (int(group, 16), int(element, 16)), action

    # Now compile the diffusion private tags patterns
    filedir = os.path.dirname(os.path.realpath(__file__))
    with open(os.path.join(filedir, "private_deidentify.json"),
              "r") as open_file:
        private_anons = json.load(open_file)
    for key, values in private_anons.items():
        for value in values:
            pattern = re.compile(value["Tag"].replace("x", "[0-9A-Fa-f]"))
            PRIVATE_DEIDENTIFY.setdefault(key, []).append(pattern)

    # Process all DICOM files
    progress_indicator = DownloadProgressBar(max=len(input_dicoms))
    dcmfiles = []
    logfiles = []
    for cnt, input_dicom in enumerate(input_dicoms):
        statinfo = os.stat(input_dicom)
        DownloadProgressBar.suffix = "{0:.3f}MB".format(statinfo.st_size /
                                                        1e6)
        progress_indicator.next(1)
        output_dicom, output_log = anonymize_dicomfile(input_dicom,
                                                       outdir,
                                                       outname=str(cnt),
                                                       write_log=write_logs)
        dcmfiles.append(output_dicom)
        logfiles.append(output_log)
    progress_indicator.finish()

    return dcmfiles, logfiles
Example 9
            rs['author'] = info[2:].split(",")
        elif info[:2] == '#t':
            rs['year'] = int(info[2:])
        elif info[:2] == '#c':
            rs['conf'] = info[2:]
        elif info[:2] == '#%':
            rs['ref'].append(int(info[2:]))
    if rs['year'] is None:
        print(s)
    if rs['year'] > 2013:
        return False
    return index, rs


paper_dict = dict()
all_count = len(papers)
bar_count = int(ceil(len(papers) / 1000))
bar = DownloadProgressBar(max=bar_count)
_i = 0
for i in bar.iter(range(bar_count)):
    for j in range(_i * 1000, (_i + 1) * 1000):
        if j == all_count:
            break
        paper = handlePaperInfo(papers[j])
        if paper is False:
            continue
        paper_dict[paper[0]] = paper[1]
    _i += 1
papers = paper_dict

print("正在构建被引用关系")
cnt = 0
for p_index in papers.keys():