def main():
    parser = argparse.ArgumentParser(
        description='dataset generator'
    )
    parser.add_argument(
        '-p', '--possibility',
        type=float,
        default=0.9,
        help='probability of assigning a line to the training dataset'
    )
    parser.add_argument(
        'source',
        help='path to mecab-processed corpus (xz compressed)'
    )
    parser.add_argument(
        'train',
        help='path for writing training dataset (xz compressed)'
    )
    parser.add_argument(
        'test',
        help='path for writing testing dataset (xz compressed)'
    )
    args = parser.parse_args()
    with lzma.open(args.source, 'rt') as source,\
         lzma.open(args.train, 'wb') as train,\
         lzma.open(args.test, 'wb') as test:
            separate(source, args.possibility, train, test)
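
The separate() helper called above is not defined in the snippet; a minimal sketch, assuming each input line is routed to the training set with the given probability and that the binary output handles expect encoded bytes:

import random

def separate(source, possibility, train, test):
    # Route each line to train or test; the outputs were opened in binary
    # mode by the caller, so encode before writing.
    for line in source:
        target = train if random.random() < possibility else test
        target.write(line.encode('utf-8'))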
Example #2
    def _open(self, mode, compress=None):
        self._txnstore.prepare_open(self._filename, mode)

        if compress is None and "compression" in self._options:
            compress = self._options["compression"]
        if compress is not None:
            if compress not in ["lzma", "xz"]:
                raise newfol.exception.FilemanipError("Compression type " +
                                                      compress +
                                                      " not supported")
            if "t" not in mode and "b" not in mode:
                mode += "t"

        if self._isfp:
            fp = self._file
        elif compress == "xz":
            check = -1
            if "r" not in mode:
                check = lzma.CHECK_SHA256
            fp = lzma.open(self._filename, mode, check=check)
        elif compress == "lzma":
            fp = lzma.open(self._filename, mode, format=lzma.FORMAT_ALONE)
        else:
            fp = open(self._filename, mode)

        self._txnstore.commit_open(self._filename, mode)
        return fp
Example #3
def open_file(filename, mode, encoding=None):
    import sys, io

    binary = mode.endswith("b")
    mode = mode.rstrip("b") + "b"

    if mode.startswith("r"):
        if filename == "-":
            fileobj = sys.stdin.buffer
        else:
            fileobj = open(filename, mode)

        buf = fileobj.peek(100)

        if buf.startswith(b"\x1f\x8b\x08"):
            import gzip
            fileobj = gzip.open(fileobj, mode)

        elif buf[0:3] == b"BZh" and buf[4:10] == b"1AY&SY":
            import bz2
            fileobj = bz2.open(fileobj, mode)

        elif buf.startswith((b"\x5d\x00\x00\x80", b"\xfd7zXZ")):
            import lzma
            fileobj = lzma.open(fileobj, mode)

    else:
        if filename == "-":
            fileobj = sys.stdout.buffer

        elif filename.endswith(".gz"):
            import gzip
            fileobj = gzip.open(filename, mode)

        elif filename.endswith(".bz2"):
            import bz2
            fileobj = bz2.open(filename, mode)

        elif filename.endswith(".xz"):
            import lzma
            fileobj = lzma.open(filename, mode)

        else:
            fileobj = open(filename, mode)

    if binary:
        return fileobj
    else:
        return io.TextIOWrapper(fileobj, encoding=encoding,
                errors="surrogateescape", line_buffering=True)
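
A possible call to open_file() above (the filename is a placeholder); the magic-byte sniffing means the caller does not need to know how the input was compressed:

with open_file('corpus.txt.xz', 'r', encoding='utf-8') as fh:
    for line in fh:
        print(line.rstrip())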
Example #4
def gplus_get_filehandler(write, fname):
    """Reserved for GraphicsPlus internal use"""
    if gplus_options["compress"]:
        if write:
            if os.path.exists(fname):
                return lzma.open(fname, "w")
            else:
                return lzma.open(fname, "x")
        else:
            return lzma.open(fname, "r")
    else:
        if write:
            return open(fname, "w+b")
        else:
            return open(fname, "rb")
Example #5
def replace_syslinux_modules(syslinux_version, under_this_dir):
    # Replace modules files extracted from iso with corresponding
    # version provided by multibootusb.
    modules_src_dir = os.path.join(
        multibootusb_host_dir(), "syslinux", "modules", syslinux_version)

    for dirpath, dirnames, filenames in os.walk(under_this_dir):
        for fname in filenames:
            if not fname.lower().endswith('.c32'):
                continue
            dst_path = os.path.join(under_this_dir, dirpath, fname)
            src_path = os.path.join(modules_src_dir, fname)
            if not os.path.exists(src_path):
                log("Suitable replacement of '%s' is not bundled. "
                    "Trying to unlzma." % fname)
                try:
                    with lzma.open(dst_path) as f:
                        expanded = f.read()
                except lzma.LZMAError:
                    continue
                except (OSError, IOError) as e:
                    log("%s while accessing %s." % (e, dst_path))
                    continue
                with open(dst_path, 'wb') as f:
                    f.write(expanded)
                log("Successfully decompressed %s." % fname)
                continue
            try:
                os.remove(dst_path)
                shutil.copy(src_path, dst_path)
                log("Replaced %s module" % fname)
            except (OSError, IOError) as err:
                log(err)
                log("Could not update " + fname)
Example #6
def universal_write_open(path, mode, buffering=-1, encoding=None, errors=None, newline=None,
                         compresslevel=9, format=None, check=-1, preset=None, filters=None,
                         compression=None):
    # pylint: disable=unexpected-keyword-arg,no-member
    if compression is None:
        return builtins.open(path, mode=mode, buffering=buffering, encoding=encoding, errors=errors,
                             newline=newline)
    elif compression in ('gz', 'gzip'):
        if six.PY2:
            return gzip.open(path, mode=mode, compresslevel=compresslevel)
        else:
            return gzip.open(path, mode=mode, compresslevel=compresslevel,
                             errors=errors, newline=newline, encoding=encoding)
    elif compression in ('lzma', 'xz'):
        try:
            import lzma
        except ImportError:
            from backports import lzma
        return lzma.open(path, mode=mode, format=format, check=check, preset=preset,
                         filters=filters, encoding=encoding, errors=errors, newline=newline)
    elif compression == 'bz2':
        if six.PY2 or '__pypy__' in sys.builtin_module_names:
            import bz2file as bz2  # pylint: disable=import-error
        else:
            import bz2

        return bz2.open(path, mode=mode, compresslevel=compresslevel, encoding=encoding,
                        errors=errors, newline=newline)
    else:
        raise ValueError(
            'compression must be None, gz, gzip, lzma, or xz and was {0}'.format(compression))
Example #7
def writeLog(filename, data):
	try:
		with lzma.open(filename, "w") as f:
			f.write(bytes(data, 'UTF-8'))
	except:
		errstr = "Error: writeLog FAIL"
		raise edce.error.ErrorLog(errstr)		
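
A matching reader for writeLog() above, a sketch that is not part of the original source: data written through lzma.open(..., "w") comes back as bytes and is decoded.

import lzma

def readLog(filename):
    # Counterpart of writeLog(): read the xz-compressed log back and decode it.
    with lzma.open(filename, "rb") as f:
        return f.read().decode('UTF-8')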
Example #8
def get_uncompressed_stream(input_stream, compression="auto"):
    """
    Returns a file-like object (aka stream) providing an uncompressed
    version of the content read on the input stream provided.

    :param input_stream: The file-like object providing compressed data.
    :param compression: The compression type. Specify "auto" to let the function
        guess it out of the associated filename (the input_stream needs to have
        a name attribute, otherwise a ValueError is raised).
    :type compression: str
    """

    if compression == "auto":  # Try to guess compression method if possible
        if hasattr(input_stream, 'name'):
            compression = guess_compression_method(input_stream.name)
        else:
            raise ValueError("Can't retrieve a name out of %r" % input_stream)

    if compression == "gzip":
        import gzip
        return gzip.open(filename=input_stream, mode="rb")
    elif compression == "bzip2":
        import bz2
        return bz2.open(filename=input_stream, mode="rb")
    elif compression == "xz":
        import lzma
        return lzma.open(filename=input_stream, mode="rb")
    elif compression is None:
        return input_stream
    else:
        raise NotImplementedError(
            "Unknown compression method: %r" % compression)
Example #9
    def _load_kanjivg(fname):
        with lzma.open(fname, 'rt', encoding='utf-8') as f:
            tree = ElementTree()
            tree.parse(f)
        
        def parse_kanji(kanji):

            # Converts str('kvg:04e17-g7') to int(7) and str('kvg:04e7e-s11') to int(11)
            indexnum    = lambda s: int(s.rpartition('-')[2][1:])
            ididx       = lambda elem: indexnum(elem.attrib['id'])

            strokes = [ stroke.attrib['d'] for stroke in sorted(kanji.findall('.//path'), key=ididx) ]

            gdata = sorted( (((group.attrib.get('element', group.attrib['id']), int(group.attrib.get('number', '0'))), group)
                        for group in kanji.findall('.//g')),
                    key=itemgetter(0))
            # We use 0-indexed stroke numbers here so you can directly use them as indices to the strokes element.
            groups = [ [ ididx(path)-1
                        for (_elem, __number), group in foo
                        for path in group.findall('.//path') ]
                    for (elem, _number), foo in groupby(gdata, itemgetter(0)) ]

            return KanjiVGEntry(strokes, groups)
        
        # Converts str('kvg:kanji_05726') to str('圦')
        kvgchr      = lambda s: chr(int(s[len('kvg:kanji_'):].rstrip(string.ascii_letters+string.punctuation), 16))
        return { kvgchr(kanji.attrib['id']): parse_kanji(kanji) for kanji in tree.findall('kanji') }
Example #10
def download(addons, extr):
    info = workshopinfo(addons)
    for res in info:
        if not "title" in res:
            print("Addon does not exist!")
            return

        name = res['title']
        download = res['file_url']

        print("Downloading '%s' from the workshop" % name)

        w = Wgety()
        lzmafile = "%s.gma.lzma" % res['publishedfileid']
        outfile = "%s.gma" % res['publishedfileid']
        w.execute(url = download, filename = lzmafile)

        print("Downloaded '%s' from the workshop. Decompressing..." % name)
        with lzma.open(lzmafile) as lzmaF:
            with open(outfile, "wb") as gma:
                gma.write(lzmaF.read())

        os.remove(lzmafile)

        if not extr: return

        name = re.sub('[\\/:"*?<>|]+', '_', name)
        gmafile.extract(outfile, name)
Example #11
def open_compressed(filename,mode='rb'):
    """
    Open a file for reading with automatic decompression.  Detects gzip, xz, and
    bz2 files via the file extension.

    Arguments
    ---------
    filename : str
        name of the file to open

    Returns
    -------
    open file object

    """

    ext = filename.split('.')[-1]

    if ext == 'gz':
        import gzip
        return gzip.open(filename,mode)
    elif ext == 'xz':
        import lzma
        return lzma.open(filename,mode)
    elif ext == 'bz2':
        import bz2
        return bz2.open(filename,mode)
    else:
        return open(filename,mode)
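
A possible way to use open_compressed() above (the path is a placeholder); an explicit text mode is passed through to the underlying opener:

with open_compressed('records.jsonl.xz', mode='rt') as fh:
    for line in fh:
        print(line.rstrip())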
Example #12
    def dumpcache(self):
        sizes = [v.size for v in self._db.values()]
        logging.info("Dumping cache for {:,} profiles in {:d} buckets to {!s}".format(
                    sum(sizes), len(self._db), self.bucket_dir()))
        bar = progressbar.ProgressBar(maxval=len(self._db), 
                    widgets=[progressbar.Bar('=', '[', ']'), ' ',
                    progressbar.Percentage()])
        bar.start()
        newdirs = 0
        counts = numpy.zeros(dtype=numpy.uint32, shape=(max(sizes)+1),)
        for (i, (k, v)) in enumerate(self._db.items()):
            path = self.bucket_name(k)
            if not path.parent.is_dir():
                #logging.info("Creating directory {!s}".format(path.parent))
                path.parent.mkdir(parents=True)
                newdirs += 1
#            if v.nbytes > 1e6:
#                logging.debug("Storing {:d}/{:d}, {:,} bytes to {!s}".format(
#                    i, len(self._db), v.nbytes, path))
            counts[v.size] += 1
            with lzma.open(str(path), mode="wb") as fp:
                numpy.save(fp, v)
            bar.update(i+1)
        bar.finish()
        logging.info("Stored cache.  Created {:d} new directories. "
                     "Profiles per bucket histogram: {!s}".format(newdirs, counts))

        self.clearcache()
Example #13
def db_iter(path='ucd.xml.xz'):
    with lzma.open(path, 'rb') as f:
        for (_, el) in et.iterparse(f):
            if '}' in el.tag:
                el.tag = el.tag.split('}', 1)[1]
            yield el
            el.clear()
Example #14
def handle_savegame(root, file):
  filename = os.path.join(root,file)
  print("Handling savegame: " + filename);
  txt = None;
  with lzma.open(filename,  mode="rt") as f:
    txt = f.read().split("\n");
    status.savegames_read += 1;

  new_filename = "pbem_processed_" + str(random.randint(0,10000000000)) + ".xz";
  f.close();
  shutil.move(filename, os.path.join(root,new_filename))
  print("New filename will be: " + new_filename);
  players = list_players(txt);
  phase = find_phase(txt);
  turn = find_turn(txt);
  game_id = find_game_id(txt);
  print("game_id=" + str(game_id));
  print("phase=" + str(phase));
  print("turn=" + str(turn));
  print("players=" + str(players));

  active_player = players[phase];
  print("active_player=" + active_player);    
  active_email = find_email_address(active_player);
  status.games[game_id] = [turn, phase, players, time.ctime()];
  if (active_email != None):
    print("active email=" + active_email);
    m = MailSender();
    m.send_email(active_player, players, active_email, new_filename.replace(".xz", ""), turn);
    status.emails_sent += 1;
Example #15
def handle_savegame(filename):
  print("Handling " + filename);
  with lzma.open(filename) as f:
    txt =str(f.read());
    phase = find_phase(txt);
    print(phase);
    print(txt);
Example #16
def load(filename):
    with lzma.open(filename, 'rb') as dataset:
        while True:
            try:
                yield pickle.load(dataset)
            except EOFError:
                break
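
A matching writer for the load() generator above, a sketch that is not from the original source: repeated pickle.dump() calls append records to a single xz stream, which load() then yields back one at a time until pickle hits EOFError.

import lzma
import pickle

def save(filename, records):
    # Append each record to one compressed stream; load() reads them back
    # one by one until the end of the file.
    with lzma.open(filename, 'wb') as dataset:
        for record in records:
            pickle.dump(record, dataset)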
Example #17
def download(addons, path, extr):
    info = workshopinfo(addons)
    for res in info:
        if not "title" in res:
            print("Addon does not exist!")
            return

        name = res['title']
        download = res['file_url']

        print("Downloading '%s' from the workshop" % name)

        lzmafile = os.path.join(path, "%s.gma.lzma" % res['publishedfileid'])
        outfile = os.path.join(path, "%s.gma" % res['publishedfileid'])

        urllib.request.urlretrieve(download, lzmafile,
            lambda x, y, z: sys.stdout.write("\r{0:.2f}%".format(x * y / z)))
        sys.stdout.write("\r100.00%\n")

        print("Downloaded '%s' from the workshop. Decompressing..." % name)
        with lzma.open(lzmafile) as lzmaF:
            with open(outfile, "wb") as gma:
                gma.write(lzmaF.read())

        os.remove(lzmafile)

        if not extr: return

        name = os.path.join(path, re.sub('[\\/:"*?<>|]+', '_', name))
        gmafile.extract(outfile, name)
Example #18
def get_badge_data_and_write_function(
    host, badge_id, filename, require_file=False
):
    filename = host + '-' + filename
    logger.info("Loading {} badges...".format(filename))

    try:
        f = lzma.open('data/' + filename + '.json.xz', 'rt') 
    except FileNotFoundError:
        try:
            f = open('data/' + filename + '.json', 'rt') 
        except FileNotFoundError:
            if not require_file:
                f = None
            else:
                raise

    if f:
        with f:
            badge_data = scraping.BadgeData.from_json(json.load(f))
    else:
        badge_data = scraping.BadgeData(host=host, badge_id=badge_id)

    logger.info("...{} {} badges loaded.".format(len(badge_data), filename))

    def write():
        logger.info("Writing {} {} badges...".format(len(badge_data), filename))
        with lzma.open('data/' + filename + '.json.xz', 'wt') as f:
            json.dump(badge_data.to_json(), f)
        logger.info("...wrote {} {} badges.".format(len(badge_data), filename))

    return badge_data, write
Example #19
    def load_tickets(self, candidates, gvt_csv):
        with lzma.open(gvt_csv, 'rt') as fd:
            reader = csv.reader(fd)
            # skip introduction line
            next(reader)
            header = next(reader)
            it = sorted(
                named_tuple_iter('GvtRow', reader, header, PreferenceNo=int, TicketNo=int, OwnerTicket=lambda t: t.strip()),
                key=lambda gvt: (gvt.State, ticket_sort_key(gvt.OwnerTicket), gvt.TicketNo, gvt.PreferenceNo))
            for (state_ab, ticket, ticket_no), g in itertools.groupby(
                    it, lambda gvt: (gvt.State, gvt.OwnerTicket, gvt.TicketNo)):
                if state_ab != self.state_name:
                    continue
                prefs = []
                for ticket_entry in g:
                    candidate = candidates.lookup_name_party(
                        ticket_entry.Surname, ticket_entry.GivenNm, ticket_entry.PartyNm)
                    prefs.append(
                        (ticket_entry.PreferenceNo, candidate.CandidateID))

                non_none = [x for x in prefs if x[0] is not None]
                self.raw_ticket_data.append(sorted(non_none, key=lambda x: x[0]))
                if ticket not in self.gvt:
                    self.gvt[ticket] = []
                self.gvt[ticket].append(PreferenceFlow(tuple(prefs)))
Example #20
def xu_open(filename, mode='rb'):
    """
    function to open a file regardless of whether it is compressed. Files with
    extension '.gz', '.bz2' or '.xz' are assumed to be compressed and are
    transparently opened for reading like regular files.

    Parameters
    ----------
     filename:  filename of the file to open (full including path)
     mode:      mode in which the file should be opened

    Returns
    -------
     file handle of the opened file

    If the file does not exist an IOError is raised by the open routine, which
    is not caught within the function
    """

    if filename.endswith('.gz'):
        fid = gzip.open(filename, mode)
    elif filename.endswith('.bz2'):
        fid = bz2.BZ2File(filename, mode)
    elif filename.endswith('.xz'):
        if sys.version_info >= (3, 3):
            fid = lzma.open(filename, mode)
        else:
            raise TypeError("File compression type not supported in Python "
                            "versions prior to 3.3")
    else:
        fid = open(filename, mode)

    return fid
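
A possible call to xu_open() above (the filename is a placeholder); compressed and plain files are handled the same way:

with xu_open('data.txt.gz', 'rb') as fid:
    raw = fid.read()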
Example #21
    def retrieve_model(self):
        os.makedirs('models', exist_ok=True)

        file_name = f'{self.name()}model'  # noqa: E999
        file_path = os.path.join('models', file_name)

        model_url = f'https://index.taskcluster.net/v1/task/project.releng.services.project.testing.bugbug_train.latest/artifacts/public/{file_name}.xz'  # noqa
        r = requests.head(model_url, allow_redirects=True)
        new_etag = r.headers['ETag']

        try:
            with open(f'{file_path}.etag', 'r') as f:  # noqa
                old_etag = f.read()
        except IOError:
            old_etag = None

        if old_etag != new_etag:
            try:
                urlretrieve(model_url, f'{file_path}.xz')
            except HTTPError:
                logger.exception('Tool {}'.format(self.name()))
                return file_path

            with lzma.open(f'{file_path}.xz', 'rb') as input_f:  # noqa
                with open(file_path, 'wb') as output_f:
                    shutil.copyfileobj(input_f, output_f)

            with open(f'{file_path}.etag', 'w') as f:  # noqa
                f.write(new_etag)

        return file_path
Example #22
def retrieve_model(name):
    os.makedirs(MODELS_DIR, exist_ok=True)

    file_name = f"{name}model"
    file_path = os.path.join(MODELS_DIR, file_name)

    base_model_url = BASE_URL.format(name)
    model_url = f"{base_model_url}/{file_name}.xz"
    LOGGER.info(f"Checking ETAG of {model_url}")
    r = requests.head(model_url, allow_redirects=True)
    r.raise_for_status()
    new_etag = r.headers["ETag"]

    try:
        with open(f"{file_path}.etag", "r") as f:
            old_etag = f.read()
    except IOError:
        old_etag = None

    if old_etag != new_etag:
        LOGGER.info(f"Downloading the model from {model_url}")
        urlretrieve(model_url, f"{file_path}.xz")

        with lzma.open(f"{file_path}.xz", "rb") as input_f:
            with open(file_path, "wb") as output_f:
                shutil.copyfileobj(input_f, output_f)
                LOGGER.info(f"Written model in {file_path}")

        with open(f"{file_path}.etag", "w") as f:
            f.write(new_etag)
    else:
        LOGGER.info(f"ETAG for {model_url} is ok")

    return file_path
Example #23
def handle_savegame(root, file):
    time.sleep(1)
    filename = os.path.join(root, file)
    print("Handling savegame: " + filename)
    txt = None
    with lzma.open(filename, mode="rt") as f:
        txt = f.read().split("\n")
        status.savegames_read += 1

    new_filename = "pbem_processed_" + str(random.randint(0, 10000000000)) + ".xz"
    f.close()
    shutil.move(filename, os.path.join(root, new_filename))
    print("New filename will be: " + new_filename)
    players = list_players(txt)
    phase = find_phase(txt)
    turn = find_turn(txt)
    game_id = find_game_id(txt)
    state = find_state(txt)
    print("game_id=" + str(game_id))
    print("phase=" + str(phase))
    print("turn=" + str(turn))
    print("state=" + str(state))
    print("players=" + str(players))

    active_player = players[phase]
    print("active_player=" + active_player)
    active_email = find_email_address(active_player)
    status.games[game_id] = [turn, phase, players, time.ctime(), int(time.time()), state]
    if active_email != None:
        print("active email=" + active_email)
        m = MailSender()
        m.send_email(active_player, players, active_email, new_filename.replace(".xz", ""), turn)
        status.emails_sent += 1
Example #24
def parse_candle(bi5, date, point=5):
    quote = {}
    if type(date) == str:
        date = dateutil.parser.parse(date).replace(
            tzinfo=pytz.utc, hour=0, minute=0, second=0, microsecond=0)
    s = struct.Struct('>L')
    try:
        with lzma.open(bi5[0]) as f:
            content = f.read()
    except EOFError:
        print('{}: File is not a valid lzma file. Continuing.'.format(date))
        return quote
    size = len(content)
    idx = 0
    while idx < size:
        time_delta = s.unpack(content[idx:idx + 4])[0]
        price_open = s.unpack(content[idx + 4:idx + 8])[0] / 10 ** point
        price_high = s.unpack(content[idx + 8:idx + 12])[0] / 10 ** point
        price_low = s.unpack(content[idx + 12:idx + 16])[0] / 10 ** point
        price_close = s.unpack(content[idx + 16:idx + 20])[0] / 10 ** point
        volume = s.unpack(content[idx + 20:idx + 24])[0]
        last_candle = date.astimezone(pytz.utc) + timedelta(seconds=time_delta)
        try:
            quote[last_candle]
        except KeyError:
            quote[last_candle] = {}
        finally:
            quote[last_candle] = {'open': price_open, 'high': price_high,
                                  'low': price_low, 'close': price_close, 'vol': volume}
        idx += 24
    return quote
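
A possible call to parse_candle() above, assuming bi5 is a sequence whose first element is the path to a Dukascopy-style .bi5 hour file and that prices carry five decimal places (the path and date below are placeholders):

candles = parse_candle(['EURUSD_2020-01-01_00h.bi5'], '2020-01-01', point=5)
for ts in sorted(candles):
    bar = candles[ts]
    print(ts, bar['open'], bar['high'], bar['low'], bar['close'], bar['vol'])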
Example #25
    def finish(self, private_key):
        # Create package index.
        def write_entry(f, package, version):
            f.write(self._get_control_snippet(package, version))
            filename = self._get_filename(package, version)
            path = os.path.join(self._new_path, filename)
            f.write(
                'Filename: %s\n'
                'Size: %u\n'
                'SHA256: %s\n' % (
                    filename,
                    os.path.getsize(path),
                    util.sha256(path).hexdigest(),
                ))

            f.write('\n')
        index = os.path.join(self._new_path, 'Packages')
        with open(index, 'wt') as f, lzma.open(index + '.xz', 'wt') as f_xz:
            for package, version in self._packages:
                write_entry(f, package, version)
                write_entry(f_xz, package, version)

        # Link the index into the per-architecture directory.
        for arch in self._architectures:
            index_arch = os.path.join(
                self._new_path,
                'dists/cloudabi/cloudabi/binary-%s/Packages' % arch)
            util.make_parent_dir(index_arch)
            os.link(index, index_arch)
            os.link(index + '.xz', index_arch + '.xz')
        checksum = util.sha256(index).hexdigest()
        checksum_xz = util.sha256(index + '.xz').hexdigest()
        size = os.path.getsize(index)
        size_xz = os.path.getsize(index + '.xz')
        os.unlink(index)
        os.unlink(index + '.xz')

        # Create the InRelease file.
        with open(
            os.path.join(self._new_path, 'dists/cloudabi/InRelease'), 'w'
        ) as f, subprocess.Popen([
            'gpg', '--local-user', private_key, '--armor',
            '--sign', '--clearsign', '--digest-algo', 'SHA256',
        ], stdin=subprocess.PIPE, stdout=f) as proc:
            def append(text):
                proc.stdin.write(bytes(text, encoding='ASCII'))
            append(
                'Suite: cloudabi\n'
                'Components: cloudabi\n'
                'Architectures: %s\n'
                'Date: %s\n'
                'SHA256:\n' % (
                ' '.join(sorted(self._architectures)),
                time.strftime("%a, %d %b %Y %H:%M:%S UTC", time.gmtime())))
            for arch in sorted(self._architectures):
                append(' %s %d cloudabi/binary-%s/Packages\n' %
                       (checksum, size, arch))
                append(' %s %d cloudabi/binary-%s/Packages.xz\n' %
                       (checksum_xz, size_xz, arch))
Example #26
def open_lzma(file, *, mode, encoding=None, errors=None, newline=None,
              external=False, parallel=False):
    if external and EXTERNAL_XZ:
        args = [EXTERNAL_XZ, '-c', '-d']
        return ProcessIOReader(args, file, mode, encoding, errors, newline)

    return lzma.open(
        file, mode=mode, encoding=encoding, errors=errors, newline=newline)
Example #27
def _recompress_to_gz(xz, gz):
    import lzma
    import gzip
    with lzma.open(xz) as xzf, gzip.open(gz, mode='xb') as gzf:
        while True:
            block = xzf.read(1024 * 1024)
            if not block: break
            gzf.write(block)
Example #28
def rebuild(rev):
    if rev is None:
        return []
    c = get_db().execute("SELECT parent, patch FROM blobs WHERE id = ?", (sqlite3.Binary(rev),))
    parent, patch_id = c.fetchone()
    with lzma.open(os.path.join(app.config["BASE_PATH"], binascii.hexlify(patch_id).decode("utf-8")), "rb") as f:
        patch_ = f.read().splitlines(True)
    return patch(rebuild(parent), patch_)
Example #29
def load_food_desc(data_dir):
    fn = os.path.join(data_dir, 'food_desc.xz')
    with lzma.open(fn, 'rt', encoding='utf8') as inputfile:
        for l in inputfile:
            a = l.strip("\n\r").split('|')
            food_dict[a[0]] = ((a[1], a[2]))
            food_list_per_group[a[1]].append((a[0], a[2]))
            food_list.append(tuple(a))
Example #30
def raw2np(filename, shape):
    if filename[-3:] == ".xz":
        sys.stdout.write("Decompressing data...\n")
        sys.stdout.flush()
        with lzma.open(filename) as decompf:
            return np.fromstring(
                decompf.read(), dtype=np.uint16).reshape(shape)
    else:
        return np.fromfile(filename, dtype=np.uint16).reshape(shape)
Example #31
def load_icom_stream(icom_path):
    with lzma.open(icom_path, "r") as f:
        contents = f.read()

    return contents
Example #32
def _index_file(pp: Path, opts: Options) -> Results:
    logger = get_logger()
    # TODO use kompress?
    # TODO not even sure if it's used...
    suf = pp.suffix.lower()

    if suf == '.xz': # TODO zstd?
        import lzma
        uname = pp.name[:-len('.xz')]
        uncomp = Path(get_tmpdir().name) / uname
        with lzma.open(pp, 'rb') as cf:
            with uncomp.open('wb') as fb:
                fb.write(cf.read())
        yield from _index(path=uncomp, opts=opts)
        return

    # TODO dispatch org mode here?
    # TODO try/catch?

    if suf not in SMAP:
        pm = mime(pp)
        if pm not in SMAP:
            yield RuntimeError(f"Unexpected file extension: {pp}, {pm}")
            return
        else:
            ip = SMAP.get(pm, None)
        # TODO assume plaintext?
    else:
        ip = SMAP.get(suf, None)
    if ip is None:
        # TODO only log once?
        logger.debug('file type suppressed: %s', pp)
        return

    indexer: Union[Urls, Results] = ip(pp) # type: ignore
    # TODO careful, filter out obviously not plaintext? maybe mime could help here??

    root = opts.root
    fallback_dt = datetime.fromtimestamp(pp.stat().st_mtime, tz=pytz.utc)
    fallback_loc = Loc.file(pp)
    replacer = opts.replacer
    for r in indexer:
        if isinstance(r, Exception):
            yield r
            continue
        if isinstance(r, EUrl):
            v = Visit(
                url=r.url,
                dt=fallback_dt,
                locator=fallback_loc,
                context='::'.join(r.ctx),
            )
        else:
            v = r

        loc = v.locator
        if loc is not None and root is not None:
            # meh. but it works
            # todo potentially, just use dataclasses instead...
            loc = loc._replace(title=loc.title.replace(str(root) + '/', ''))
            v = v._replace(locator=loc)

        if replacer is not None:
            upd: Dict[str, Any] = {}
            href = v.locator.href
            if href is not None:
                upd['locator'] = v.locator._replace(href=replacer(href), title=replacer(v.locator.title))
            ctx = v.context
            if ctx is not None:
                # TODO in context, http is unnecessary
                upd['context'] = replacer(ctx)
            v = v._replace(**upd)
        yield v
Example #33
def read_file(filepath):
    assert os.access(filepath, os.R_OK)

    with lzma.open(filepath, 'rt', encoding='utf-8') as hpx_output_handle:
        hpx_output = hpx_output_handle.read()
    return hpx_output
Example #34
 def __init__(self, word_list=None):
     if word_list is None:
         word_list = 'eff-long'
     with lzma.open(wordlist_path('{}.txt.xz'.format(word_list))) as f:
         self.wordlist = f.read().decode().strip().split('\n')
Example #35
def cached_path(
    url_or_filename,
    download_config=None,
    **download_kwargs,
) -> Optional[str]:
    """
    Given something that might be a URL (or might be a local path),
    determine which. If it's a URL, download the file and cache it, and
    return the path to the cached file. If it's already a local path,
    make sure the file exists and then return the path.

    Return:
        Local path (string)

    Raises:
        FileNotFoundError: in case of non-recoverable file
            (non-existent or no cache on disk)
        ConnectionError: in case of unreachable url
            and no cache on disk
        ValueError: if it couldn't parse the url or filename correctly
        requests.exceptions.ConnectionError: in case of internet connection issue
    """
    if download_config is None:
        download_config = DownloadConfig(**download_kwargs)

    cache_dir = download_config.cache_dir or config.HF_DATASETS_CACHE
    if isinstance(cache_dir, Path):
        cache_dir = str(cache_dir)
    if isinstance(url_or_filename, Path):
        url_or_filename = str(url_or_filename)

    if is_remote_url(url_or_filename):
        # URL, so get it from the cache (downloading if necessary)
        output_path = get_from_cache(
            url_or_filename,
            cache_dir=cache_dir,
            force_download=download_config.force_download,
            proxies=download_config.proxies,
            resume_download=download_config.resume_download,
            user_agent=download_config.user_agent,
            local_files_only=download_config.local_files_only,
            use_etag=download_config.use_etag,
            max_retries=download_config.max_retries,
        )
    elif os.path.exists(url_or_filename):
        # File, and it exists.
        output_path = url_or_filename
    elif urlparse(url_or_filename).scheme == "" or os.path.ismount(urlparse(url_or_filename).scheme + ":/"):
        # File, but it doesn't exist.
        # On unix the scheme of a local path is empty, while on windows the scheme is the drive name (ex: "c")
        # for details on the windows behavior, see https://bugs.python.org/issue42215
        raise FileNotFoundError("Local file {} doesn't exist".format(url_or_filename))
    else:
        # Something unknown
        raise ValueError("unable to parse {} as a URL or as a local path".format(url_or_filename))

    if download_config.extract_compressed_file and output_path is not None:

        if (
            not is_zipfile(output_path)
            and not tarfile.is_tarfile(output_path)
            and not is_gzip(output_path)
            and not is_xz(output_path)
            and not is_rarfile(output_path)
        ):
            return output_path

        # Path where we extract compressed archives
        # We extract in the cache dir, and get the extracted path name by hashing the original path
        abs_output_path = os.path.abspath(output_path)
        output_path_extracted = os.path.join(cache_dir, "extracted", hash_url_to_filename(abs_output_path))

        if (
            os.path.isdir(output_path_extracted)
            and os.listdir(output_path_extracted)
            and not download_config.force_extract
        ) or (os.path.isfile(output_path_extracted) and not download_config.force_extract):
            return output_path_extracted

        # Prevent parallel extractions
        lock_path = output_path + ".lock"
        with FileLock(lock_path):
            shutil.rmtree(output_path_extracted, ignore_errors=True)
            os.makedirs(output_path_extracted, exist_ok=True)
            if tarfile.is_tarfile(output_path):
                tar_file = tarfile.open(output_path)
                tar_file.extractall(output_path_extracted)
                tar_file.close()
            elif is_gzip(output_path):
                os.rmdir(output_path_extracted)
                with gzip.open(output_path, "rb") as gzip_file:
                    with open(output_path_extracted, "wb") as extracted_file:
                        shutil.copyfileobj(gzip_file, extracted_file)
            elif is_zipfile(output_path):  # put zip file to the last, b/c it is possible wrongly detected as zip
                with ZipFile(output_path, "r") as zip_file:
                    zip_file.extractall(output_path_extracted)
                    zip_file.close()
            elif is_xz(output_path):
                os.rmdir(output_path_extracted)
                with lzma.open(output_path) as compressed_file:
                    with open(output_path_extracted, "wb") as extracted_file:
                        shutil.copyfileobj(compressed_file, extracted_file)
            elif is_rarfile(output_path):
                if config.RARFILE_AVAILABLE:
                    import rarfile

                    rf = rarfile.RarFile(output_path)
                    rf.extractall(output_path_extracted)
                    rf.close()
                else:
                    raise EnvironmentError("Please pip install rarfile")
            else:
                raise EnvironmentError("Archive format of {} could not be identified".format(output_path))

        return output_path_extracted

    return output_path
Example #36
def smart_open(filename, *args, **kwargs):
    if filename.endswith('.xz'):
        return lzma.open(filename, *args, **kwargs)
    else:
        return open(filename, *args, **kwargs)
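
Possible usage of smart_open() above: the same call works whether or not the path ends in .xz (the filename is a placeholder).

with smart_open('events.log.xz', 'rt') as fh:
    first_line = fh.readline()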
Example #37
		if arch_pkg:
			arch_pkg.add_deb(info)

	#	print(convertPackage(info, package_names + optional_names))


# get list of unique arch packages from package map
arch_package_names=list(arch_packages.keys())
arch_package_names.sort()
deb_package_names=[]

print(header_tpl.format(
	package_names="(" + " ".join( arch_package_names ) + ")",
	pkgver=pkgver,
	pkgrel=pkgrel,
	dlagents=dlagents,
	source="\n\t".join(sources),
	sha256sums="\n\t".join(sha256sums)
))

print(package_functions)


with lzma.open(source_file, "r") as tar:
	with tarfile.open(fileobj=tar) as tf:
		with tf.extractfile("amdgpu-pro-%s-%s/Packages" %(pkgver_base,pkgver_build)) as packages:
			writePackages(packages)

for pkg in arch_package_names:
	print( arch_packages[pkg].toPKGBUILD() )
Example #38
            ('bc', BaggingClassifier(verbose=True))
                ]

    model = StackingClassifier(estimators=estimators, final_estimator=GBC(n_estimators=500, max_depth=20, verbose=True))
    model.fit(train_data, train_target)

    print(f"Train set score: {accuracy_score(model.predict(train_data), train_target)}")
    print(f"Train set score: {accuracy_score(model.predict(test_data), test_target)}")

    # TODO: The trained model needs to be saved. All sklearn models can
    # be serialized and deserialized using the standard `pickle` module.
    # Additionally, we also compress the model.
    #
    # To save a model, open a target file for binary access, and use
    # `pickle.dump` to save the model to the opened file:
    with lzma.open(args.model_path, "wb") as model_file:
        pickle.dump(model, model_file)

# The `recodex_predict` is called during ReCodEx evaluation (there can be
# several Python sources in the submission, but exactly one should contain
# a `recodex_predict` method).
def recodex_predict(data):
    # The `data` is a pandas.DataFrame containing the test set input.

    args = parser.parse_args([])

    # TODO: Predict target values for the given data.
    #
    # You should probably start by loading a model. Start by opening the model
    # file for binary read access and then use `pickle.load` to deserialize the
    # model from the stored binary data:
Example #39
def fixture_fileh(filename):
    return lzma.open(os.path.join(dirname, "../data", filename), "r")
Example #40
def main():
    print("<div style=\"border:1px solid black;\">", end="\n\n")
    print("`{bm-disable-all}`", end="\n\n")
    try:
        # *** THIS BLOCK LOADS IN THE SEQUENCE AND GENES FOR BAKER'S YEAST ***
        fasta_filepath = 'GCA_000146045.2_R64_genomic.fna.xz'
        with lzma.open(fasta_filepath, mode='rt', encoding='utf-8') as f:
            lines = f.read().splitlines()
            lines = [line for line in lines if not line.startswith('>')]
            seq = ''.join(lines).upper()

        genes_filepath = 'GCA_000146045.2_R64_gene_result.txt.xz'
        with lzma.open(genes_filepath, mode='rt', encoding='utf-8') as f:
            csv_reader = csv.reader(f, delimiter='\t')
            genes_data = []
            for row in csv_reader:
                genes_data.append(row)

        genes = []
        for row in genes_data[1:]:
            gene_start_pos_str = row[12].strip()
            if gene_start_pos_str == '':
                continue
            gene_name = row[5].strip()
            gene_start_pos = int(gene_start_pos_str)
            genes.append((gene_name, gene_start_pos))
        genes.sort(key=lambda g: g[1])

        # This is an artificial example. I went through yeast motifs in http://motifmap.ics.uci.edu/ and picked one out
        # (DIG1). Then I went through NCBI and tried to find the genome and gene list for this particular strain of
        # yeast. I searched through the sequence and known gene locations to pull out gene upstream regions that
        # contained this motif.
        #
        # This is the closest I could get to a practical example short of doing my own experiments, which I don't have
        # the equipment or wherewithal to do.
        #
        # Motifs: http://motifmap.ics.uci.edu/ (Click on motif search and select yeast -- it will display all motifs)
        # Sequence: https://www.ncbi.nlm.nih.gov/assembly/GCF_000146045.2
        # Gene list: https://www.ncbi.nlm.nih.gov/gene/?term=txid559292%5BOrganism%3Anoexp%5D+DIG1
        #
        # To speed the example up, I only used a subset of the found gene upstreams.

        # *** THIS BLOCK SEARCHES THE SEQUENCE FOR MOTIF MEMBERS OF DIG1 ***
        # search_locs = []
        # for m in re.finditer(r'AAA..[AG]AA.GA[AG][AG]AA.A[AG]', seq):  # This is the motif for DIG1
        #     start_idx, end_idx = m.span()
        #     motif_member = m.string[start_idx:end_idx]
        #     closest_gene =\
        #         min(
        #             map(
        #                 lambda g: (g[0], g[1] - start_idx, g[1]),
        #                 filter(
        #                     lambda g: g[1] > start_idx,
        #                     genes
        #                 )
        #             ),
        #             key=lambda g: g[1],
        #             default=None
        #         )
        #     if closest_gene is not None and closest_gene[1] <= 2000:
        #         print(f'Found {motif_member} {start_idx}, closest gene is {closest_gene}')
        #         search_locs.append((closest_gene[2] - 2000, closest_gene[2]))
        # for start_idx, end_idx in search_locs:
        #     print(f'seq[{start_idx}:{end_idx}]')
        # # Found AAAAGGAAGGAAAAATAG 14779, closest gene is ('THI12', 53, 14832)
        # # Found AAACAAAAAGAAAAAAAG 65682, closest gene is ('TOS6', 62, 65744)
        # # Found AAAAGAAAAGAGAAATAG 67732, closest gene is ('snR85', 36, 67768)
        # # Found AAAAAAAAGGAAAAAAAG 70180, closest gene is ('YHL017W', 96, 70276)
        # # Found AAAGAAAAAGAAAAAAAA 128183, closest gene is ('SYN8', 69, 128252)
        # # Found AAAAGAAAAGAAAAAAAG 172014, closest gene is ('YPL199C', 19, 172033)
        # # Found AAACGGAATGAGGAATAA 183306, closest gene is ('RPC53', 37, 183343)
        # # Found AAAAAAAACGAAAAAAAA 268978, closest gene is ('CDC5', 41, 269019)
        # # Found AAAAAAAAGGAAAAAGAA 293881, closest gene is ('YBR027C', 143, 294024)
        # # Found AAAGAAAAAGAAAAAGAA 404722, closest gene is ('YCK3', 91, 404813)
        # # Found AAACGGAATGAGGAATAA 451419, closest gene is ('MEH1', 15, 451434)
        # # Found AAACGGAATGAGGAATAA 457003, closest gene is ('KTR6', 115, 457118)
        # # Found AAAAAAAACGAGAAAAAG 488333, closest gene is ('MSK1', 53, 488386)
        # # Found AAACGGAATGAGGAATAA 489960, closest gene is ('VPS21', 236, 490196)
        # # Found AAACGGAATGAGGAATAA 495545, closest gene is ('SHE3', 47, 495592)
        # # Found AAACGGAATGAGGAATAA 557448, closest gene is ('TIF34', 33, 557481)
        # # Found AAAAAAAATGAAAAACAA 590680, closest gene is ('GRR1', 192, 590872)
        # # Found AAACGAAACGAAGAAAAA 645845, closest gene is ('YDR098C-B', 13, 645858)
        # # Found AAAGAAAAAGAGAAATAA 760834, closest gene is ('BSC5', 289, 761123)
        # # Found AAAAAGAAAGAAAAAAAG 779839, closest gene is ('IRC13', 31, 779870)
        # # Found AAAGCAAAAGAAGAAAAA 780606, closest gene is ('DFR1', 300, 780906)
        # # Found AAAACAAACGAAAAAAAA 783720, closest gene is ('ECL1', 503, 784223)
        # # Found AAAGAAAATGAAAAAAAA 791430, closest gene is ('STB3', 918, 792348)
        # # Found AAACGAAAGGAGAAATAA 873685, closest gene is ('FBP1', 61, 873746)
        # # Found AAAGAAAAGGAAAAAAAG 1116394, closest gene is ('YCG1', 732, 1117126)
        # # Found AAACGGAATGAGGAATAA 1126271, closest gene is ('UBX5', 1601, 1127872)
        # # Found AAACGGAATGAGGAATAA 1195086, closest gene is ('BCP1', 326, 1195412)
        # # Found AAACAAAAAGAAAAACAA 1195582, closest gene is ('TFC6', 1097, 1196679)
        # # Found AAACGGAATGAGGAATAA 1212779, closest gene is ('KEI1', 69, 1212848)

        # *** THIS BLOCK RUNS MOTIF FINDING ALGO ON THE GENE UPSTREAM REGIONS -- FOUND MOTIF SHOULD BE FOR DIG1 ***
        #  note: some gene upstream regions were commented out to speed up motif finding
        gene_upstreams = [
            seq[12832:14832],  # THI12
            # seq[63744:65744],
            # seq[65768:67768],
            seq[68276:70276],  # YHL017W
            seq[126252:128252],  # SYN8
            # seq[170033:172033],
            # seq[181343:183343],
            # seq[267019:269019],
            # seq[292024:294024],
            # seq[402813:404813],
            # seq[449434:451434],
            # seq[455118:457118],
            # seq[486386:488386],
            # seq[488196:490196],
            # seq[493592:495592],
            # seq[555481:557481],
            # seq[588872:590872],
            # seq[643858:645858],
            # seq[759123:761123],
            # seq[777870:779870],
            # seq[778906:780906],
            # seq[782223:784223],
            # seq[790348:792348],
            # seq[871746:873746],
            seq[1115126:1117126],  # YCG1
            seq[1125872:1127872],  # UBX5
            # seq[1193412:1195412],
            # seq[1194679:1196679],
            seq[1210848:1212848]  # KEI1
        ]
        k = 18  # If we searched for a slightly larger or smaller k, we would likely get some hits that contain parts of
        # the correct motif members (k=18). I think the correct course of action is to play with k and see what
        # parts of the upstream regions light up. If they're consistently lighting up within the same parts, you
        # may be on the right track?
        #
        # Since this is an artificial example, we already know that k=18.
        print(
            f'Organism is baker\'s yeast. Suspected genes influenced by transcription factor: THI12, YHL017W, SYN8,'
            f' YCG1, UBX5, and KEI1.',
            end="\n\n")
        print(
            f'Searching for {k}-mer across a set of {len(gene_upstreams)} gene upstream regions...',
            end="\n\n")
        best_motif_matrix = None
        for iteration in range(200):
            found_motif_matrix = randomized_motif_search_with_psuedocounts(
                k, gene_upstreams)
            if best_motif_matrix is None or score_motif(
                    found_motif_matrix) < score_motif(best_motif_matrix):
                best_motif_matrix = found_motif_matrix
        print(f'{"<br>".join(best_motif_matrix)}', end="\n\n")
        print(f'Score is: {score_motif(best_motif_matrix)}', end="\n\n")
    finally:
        print("</div>", end="\n\n")
        print("`{bm-enable-all}`", end="\n\n")
Example #41
    'required': ['id', 'description', 'find', 'drs'],
}

jsonschema.validate(config, schema)

drs_re = re.compile(config["drs"], re.VERBOSE)

find_command = [
    "/bin/find",
    *config["find"]["paths"],
    *shlex.split(config["find"].get("options", "")),
]
print(shlex.join(find_command))

with tempfile.TemporaryFile('w+') as f, tempfile.TemporaryFile(
        'w+') as s, lzma.open("catalogue.csv.xz", mode="wt",
                              newline="") as out, lzma.open('errors.xz',
                                                            mode='wt') as e:

    # Find files
    print("Finding Files...")
    find = subprocess.run(find_command, stdout=f)
    find.check_returncode()
    f.seek(0)

    # Sort the results
    print("Sorting Files...")
    sort = subprocess.run(["/bin/sort"], stdin=f, stdout=s)
    sort.check_returncode()
    s.seek(0)

    # Get the column names
Example #42
def xopen(filename, mode='r'):
    """
	Replacement for the "open" function that can also open files that have
	been compressed with gzip, bzip2 or xz. If the filename is '-', standard
	output (mode 'w') or input (mode 'r') is returned. If the filename ends
	with .gz, the file is opened with a pipe to the gzip program. If that
	does not work, then gzip.open() is used (the gzip module is slower than
	the pipe to the gzip program). If the filename ends with .bz2, it's
	opened as a bz2.BZ2File. Otherwise, the regular open() is used.

	mode can be: 'rt', 'rb', 'a', 'wt', or 'wb'
	Instead of 'rt' and 'wt', 'r' and 'w' can be used as abbreviations.

	In Python 2, the 't' and 'b' characters are ignored.

	Append mode ('a') is unavailable with BZ2 compression and will raise an error.
	"""
    if mode == 'r':
        mode = 'rt'
    elif mode == 'w':
        mode = 'wt'
    if mode not in ('rt', 'rb', 'wt', 'wb', 'a'):
        raise ValueError("mode '{0}' not supported".format(mode))
    if not PY3:
        mode = mode[0]
    if not isinstance(filename, basestring):
        raise ValueError("the filename must be a string")

    # standard input and standard output handling
    if filename == '-':
        if not PY3:
            return sys.stdin if 'r' in mode else sys.stdout
        return dict(rt=sys.stdin,
                    wt=sys.stdout,
                    rb=sys.stdin.buffer,
                    wb=sys.stdout.buffer)[mode]

    if filename.endswith('.bz2'):
        if bz2 is None:
            raise ImportError(
                "Cannot open bz2 files: The bz2 module is not available")
        if PY3:
            if 't' in mode:
                return io.TextIOWrapper(bz2.BZ2File(filename, mode[0]))
            else:
                return bz2.BZ2File(filename, mode)
        else:
            return bz2.BZ2File(filename, mode)
    elif filename.endswith('.xz'):
        if lzma is None:
            raise ImportError(
                "Cannot open xz files: The lzma module is not available (use Python 3.3 or newer)"
            )
        return lzma.open(filename, mode)
    elif filename.endswith('.gz'):
        if PY3:
            if 't' in mode:
                return io.TextIOWrapper(gzip.open(filename, mode[0]))
            else:
                if 'r' in mode:
                    return io.BufferedReader(gzip.open(filename, mode))
                else:
                    return io.BufferedWriter(gzip.open(filename, mode))
        else:
            # rb/rt are equivalent in Py2
            if 'r' in mode:
                try:
                    return GzipReader(filename)
                except IOError:
                    # gzip not installed
                    return buffered_reader(gzip.open(filename, mode))
            else:
                try:
                    return GzipWriter(filename, mode)
                except IOError:
                    return buffered_writer(gzip.open(filename, mode))
    else:
        return open(filename, mode)
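
A possible call to xopen() above (the filename is a placeholder); an .xz suffix is decompressed transparently, and 'r' is accepted as shorthand for 'rt':

with xopen('reads.fastq.xz', 'r') as fh:
    header = fh.readline()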
Example #43
 def get_stored_def(self, rid):
     with lzma.open(self.run_dir / str(rid) / STORED_DEF_NAME) as codefile:
         return codefile.read().decode('utf-8')
Example #44
import lzma

with open('X86_hello', 'rb') as f:
	data = f.read()
	with lzma.open("X86_hello.xz", "w") as fd:
		fd.write(data)
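
The reverse of the snippet above, a sketch that is not part of the original source: read the compressed copy back and check it matches what was written.

import lzma

with lzma.open("X86_hello.xz", "rb") as fd:
    restored = fd.read()
assert restored == data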
Example #45
import lzma

from FindMaximalNonBranchingPaths import find_maximal_non_branching_paths
from Kdmer import Kdmer
from Read import Read
from ReadPair import ReadPair
from ToDeBruijnGraph import to_debruijn_graph

reads_filepath = 'FinalChallengeReads.txt.xz'
with lzma.open(reads_filepath, mode='rt', encoding='utf-8') as f:
    lines = f.read().splitlines()
    lines = [l.strip() for l in lines]  # get rid of whitespace
    lines = [l for l in lines if len(l) > 0]  # get rid of empty lines

lines_split = [tuple(l.split('|', maxsplit=2)) for l in lines]
kdmers = [Kdmer(k1, k2, 1000) for k1, k2 in lines_split]
rps = [ReadPair(kdmer) for kdmer in kdmers]
broken_rps = [broken_rp for rp in rps for broken_rp in rp.shatter(40)]

broken_rps = list(set(broken_rps))

graph = to_debruijn_graph(broken_rps)
contig_paths = find_maximal_non_branching_paths(graph)

contig_paths.sort(key=lambda x: len(x))

for path in contig_paths:
    if len(path) >= path[0].d:
        out = path[0].stitch(path)
        print(f'{len(path)} kd-mers = {out}')
    else:
Example #46
    default=False)
oparser.add_argument('--cleanhtml',
                     action='store_true',
                     help='Clean HTML to remove javascript, css and head tags',
                     default=False)
options = oparser.parse_args()

logging.basicConfig(format='%(asctime)s %(levelname)-8s %(message)s',
                    level=logging.INFO if options.verbose else logging.ERROR,
                    datefmt='%Y-%m-%d %H:%M:%S')

f = None
fo = None

if options.input[-3:] == ".xz":
    f = ArchiveIterator(lzma.open(options.input, 'r'))
elif options.input[-3:] == ".gz":
    f = ArchiveIterator(open(options.input, 'rb'))
elif options.input == sys.stdin:
    f = ArchiveIterator(options.input.buffer)
else:
    f = ArchiveIterator(open(options.input, 'rb'))

if options.output == sys.stdout:
    fo = WARCWriter(options.output.buffer, gzip=True)
else:
    fo = WARCWriter(open(options.output, 'wb'), gzip=True)

if options.pdfextract:
    extractor = ExtrP()
Example #47
 if filename.endswith('.gz'):
     self.fd = gzip.open(filename, 'rb')
     try:
         # read a bit to make sure it's a gzip file
         self.fd.read(10)
         self.fd.seek(0, 0)
     except Exception, e:
         print >> log, "[EPGImport] File downloaded is not a valid gzip file", filename
         self.downloadFail(e)
         return
 elif filename.endswith('.xz') or filename.endswith('.lzma'):
     try:
         import lzma
     except ImportError:
         from backports import lzma
     self.fd = lzma.open(filename, 'rb')
     try:
         # read a bit to make sure it's an xz file
         self.fd.read(10)
         self.fd.seek(0, 0)
     except Exception, e:
         print >> log, "[EPGImport] File downloaded is not a valid xz file", filename
         self.downloadFail(e)
         return
 else:
     self.fd = open(filename, 'rb')
 if deleteFile and self.source.parser != 'epg.dat':
     try:
         print >> log, "[EPGImport] unlink", filename
         os.unlink(filename)
     except Exception, e:
Example #48
with open('data/cases.csv', mode='w') as csvfile:
    fieldnames = [
        'id', 'url', 'name', 'name_abbreviation', 'body', 'decision_date',
        'decision_year', 'decision_month', 'docket_number', 'first_page',
        'last_page', 'frontend_url', 'citations_count', 'citation_0_type',
        'citation_0', 'citation_1_type', 'citation_1', 'citation_2_type',
        'citation_2', 'volume_barcode', 'volume_number', 'reporter_name',
        "reporter_id", 'court_id', 'court_name', 'jurisdiction_id',
        'jurisdiction_name'
    ]

    output = csv.DictWriter(csvfile, fieldnames=fieldnames)
    output.writeheader()

    for state in ['North Carolina', 'Arkansas', 'Illinois', 'New Mexico']:
        with lzma.open(f'{state}-20200302-xml/data/data.jsonl.xz',
                       mode='r') as in_file:
            for line in in_file:
                case = json.loads(str(line, 'utf8'))

                try:
                    decision_date = parser.parse(case['decision_date'])
                except ParserError as e:  # if date is out of range, parse year & month
                    decision_date = parser.parse(case['decision_date'][:7])

                case['decision_year'] = decision_date.year
                case['decision_month'] = decision_date.month

                for i, citation in enumerate(case['citations']):
                    if i > 3:
                        print(f"more than {len(case['citations'])} citations")
                        break
Example #49
        help='Column that contains the first document of the document pairs',
        default=0,
        type=int)
    parser.add_argument(
        '--column2',
        help='Column that contains the second document of the document pairs',
        default=1,
        type=int)

    args = parser.parse_args()

    lang2_docs = set()
    lang2_read_docs = {}

    if args.indices[-3:] == '.xz':
        reader = lzma.open(args.indices, 'rt')
    elif args.indices[-3:] == '.gz':
        reader = gzip.open(args.indices, 'rt')
    else:
        reader = open(args.indices, 'r')

    for line in reader:
        fields = line.split('\t')
        lang2_docs.add(int(fields[args.column2]))

    reader.seek(0)

    with open_xz_or_gzip_or_plain(args.tokenized1) as tok_reader1, \
            open_xz_or_gzip_or_plain(args.tokenized2) as tok_reader2, \
            open_xz_or_gzip_or_plain(args.text1) as text_reader1, \
            open_xz_or_gzip_or_plain(args.text2) as text_reader2:
Example #50
    def get_node(self, disconnect=None, options=None, may_fail=False,
                 may_reconnect=False, random_hsm=False,
                 feerates=(15000, 7500, 3750), start=True, log_all_io=False,
                 dbfile=None, node_id=None, allow_broken_log=False,
                 wait_for_bitcoind_sync=True, allow_bad_gossip=False):
        if not node_id:
            node_id = self.get_node_id()

        port = self.get_next_port()

        lightning_dir = os.path.join(
            self.directory, "lightning-{}/".format(node_id))

        if os.path.exists(lightning_dir):
            shutil.rmtree(lightning_dir)

        socket_path = os.path.join(lightning_dir, "lightning-rpc").format(node_id)
        daemon = LightningD(
            lightning_dir, bitcoindproxy=self.bitcoind.get_proxy(),
            port=port, random_hsm=random_hsm, node_id=node_id
        )
        # If we have a disconnect string, dump it to a file for daemon.
        if disconnect:
            daemon.disconnect_file = os.path.join(lightning_dir, "dev_disconnect")
            with open(daemon.disconnect_file, "w") as f:
                f.write("\n".join(disconnect))
            daemon.opts["dev-disconnect"] = "dev_disconnect"
        if log_all_io:
            assert DEVELOPER
            daemon.env["LIGHTNINGD_DEV_LOG_IO"] = "1"
            daemon.opts["log-level"] = "io"
        if DEVELOPER:
            daemon.opts["dev-fail-on-subdaemon-fail"] = None
            daemon.env["LIGHTNINGD_DEV_MEMLEAK"] = "1"
            if os.getenv("DEBUG_SUBD"):
                daemon.opts["dev-debugger"] = os.getenv("DEBUG_SUBD")
            if VALGRIND:
                daemon.env["LIGHTNINGD_DEV_NO_BACKTRACE"] = "1"
            if not may_reconnect:
                daemon.opts["dev-no-reconnect"] = None

        if options is not None:
            daemon.opts.update(options)

        rpc = LightningRpc(socket_path, self.executor)

        node = LightningNode(daemon, rpc, self.bitcoind, self.executor, may_fail=may_fail,
                             may_reconnect=may_reconnect, allow_broken_log=allow_broken_log,
                             allow_bad_gossip=allow_bad_gossip)

        # Regtest estimatefee are unusable, so override.
        node.set_feerates(feerates, False)

        self.nodes.append(node)
        if VALGRIND:
            node.daemon.cmd_prefix = [
                'valgrind',
                '-q',
                '--trace-children=yes',
                '--trace-children-skip=*python*,*bitcoin-cli*',
                '--error-exitcode=7',
                '--log-file={}/valgrind-errors.%p'.format(node.daemon.lightning_dir)
            ]

        if dbfile:
            out = open(os.path.join(node.daemon.lightning_dir, 'lightningd.sqlite3'), 'xb')
            with lzma.open(os.path.join('tests/data', dbfile), 'rb') as f:
                out.write(f.read())

        if start:
            try:
                node.start(wait_for_bitcoind_sync)
            except Exception:
                node.daemon.stop()
                raise
        return node
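The dbfile branch above loads the entire compressed database into memory before writing it out; for larger fixtures a streaming copy keeps memory usage flat. A minimal sketch with hypothetical paths:

import lzma
import shutil

with lzma.open('tests/data/fixture.sqlite3.xz', 'rb') as src, \
        open('lightningd.sqlite3', 'xb') as dst:
    shutil.copyfileobj(src, dst)  # decompress in chunks instead of one big read()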
Example #51
0
    def run(self):
        """Runs the UpdateThread
        """
        if self._adb is None:
            self.onError.emit('ADB not set')
            return

        if not self._adb.min_required:
            self.onError.emit('ADB MinRequired')
            return

        if not utils.is_connected():
            self.onError.emit('Not connected')
            return

        if self._frida_update_url is None or self._frida_update_url == '':
            self.onError.emit('Missing frida download url')
            return

        self.onStatusUpdate.emit('Downloading latest frida')

        try:
            if utils.is_connected():
                request = requests.get(self._frida_update_url, stream=True)
            else:
                self.onError.emit('Not connected')
                return
        except requests.ConnectionError:
            self.onError.emit('Failed to download latest frida')
            return

        # reset url
        self._frida_update_url = None

        if request is not None and request.status_code == 200:
            # write data to local file
            try:
                with open('frida.xz', 'wb') as frida_archive:
                    for chunk in request.iter_content(chunk_size=1024):
                        if chunk:
                            frida_archive.write(chunk)
            except EnvironmentError:
                self.onError.emit('Failed to write frida.xz')
                return

            # start extraction
            if os.path.exists('frida.xz'):
                self.onStatusUpdate.emit('Extracting latest frida')
                try:
                    with lzma.open('frida.xz') as frida_archive:
                        with open('frida-server', 'wb') as frida_binary:
                            frida_binary.write(frida_archive.read())

                    # remove downloaded archive
                    os.remove('frida.xz')
                except lzma.LZMAError:
                    self.onError.emit('Failed to extract frida.xz')
                    return
                except EnvironmentError:
                    self.onError.emit('Failed to write frida')
                    return
            else:
                self.onError.emit('Failed to open frida.xz')
                return

            self.onStatusUpdate.emit('Mounting devices filesystem')
            # mount system rw
            if self._adb.mount_system():
                self.onStatusUpdate.emit('Pushing to device')
                # push file to device
                self._adb.push('frida-server', '/sdcard/')
                self.onStatusUpdate.emit('Setting up and starting frida')
                # kill frida
                self._adb.kill_frida()

                _device_path = '/system/xbin'
                res = self._adb.su_cmd('ls ' + _device_path)
                if 'No such file or directory' in res:
                    # use /system/bin
                    _device_path = _device_path.replace('x', '')

                # copy file; note: mv sometimes gives an invalid id error
                self._adb.su_cmd('cp /sdcard/frida-server ' + _device_path +
                                 '/frida-server')
                # remove file
                self._adb.su_cmd('rm ' + _device_path +
                                 '/frida')  # remove old named file
                self._adb.su_cmd('rm /sdcard/frida-server')

                # just to make sure
                self._adb.su_cmd('chown root:root ' + _device_path +
                                 '/frida-server')
                # make it executable
                self._adb.su_cmd('chmod 06755 ' + _device_path +
                                 '/frida-server')

                # start it
                if self._adb.get_frida_version():
                    if not self._adb.start_frida():
                        self.onError.emit(
                            'Failed to start fridaserver on Device')
            else:
                print('failed to mount /system on device')

            # delete extracted file
            if os.path.exists('frida-server'):
                os.remove('frida-server')
        else:
            self.onError.emit('Failed to download latest frida! Error: %d' %
                              request.status_code)
            return

        self.onFinished.emit()
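An alternative sketch, not the plugin's actual flow: the xz payload could be decompressed while it downloads by feeding each chunk to an incremental LZMADecompressor, which avoids the temporary frida.xz file entirely (URL and output path are placeholders):

import lzma
import requests

decompressor = lzma.LZMADecompressor()
with requests.get('https://example.com/frida-server.xz', stream=True) as response, \
        open('frida-server', 'wb') as frida_binary:
    for chunk in response.iter_content(chunk_size=1024):
        if chunk:
            frida_binary.write(decompressor.decompress(chunk))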
Example #52
0
def RequestThread(ReqType, ReqMsg):
    global isConnected
    record = []
    if isConnected:
        startTime = datetime.now()
        record.append('\'' + str(startTime))
        record.append(ReqType)
        try:
            #Send request to server
            clientSocket.send(ReqMsg.encode())
        except socket.error as msg:
            now = str(datetime.now())[:-7]
            LoggingText.insert(
                'insert',
                '{0}: Server connection failed ({1})\n'.format(now, msg))
            isConnected = False
        else:
            LoggingText.insert('insert', '{} request sent\n'.format(ReqType))
            #Receive message from server
            response = clientSocket.recv(1024)
            if response:
                LoggingText.insert(
                    'insert', 'Response from server: {0} \n'.format(
                        response.decode('utf-8')))
                expectedResponse = '{} request accepted'.format(ReqType)
                if response.decode() == expectedResponse:
                    filepath = SourceFilePathVar.get()
                    filename = os.path.basename(filepath)
                    filesize = os.stat(filepath).st_size
                    record.append(filesize)
                    if os.path.isfile(filepath):

                        if IsCompressedVar.get() == 1:
                            zipStartTime = datetime.now()
                            #Zipfile compression
                            # zipfilename = filename.split('.')[0] + '.zip'
                            # with zipfile.ZipFile(zipfilename, 'w', zipfile.ZIP_DEFLATED) as f:
                            #     f.write(filename)
                            #lzma compression
                            zipfilename = filename.split('.')[0] + '.xz'
                            with lzma.open(zipfilename, 'wb') as f:
                                with open(filename, 'rb') as pf:
                                    textContent = pf.read()
                                f.write(textContent)
                            filepath = zipfilename
                            filename = zipfilename
                            filesize = os.stat(filepath).st_size
                            record.append(filesize)
                            zipDuration = datetime.now() - zipStartTime
                            record.append('\'' + str(zipDuration))
                        else:
                            record.append('None')
                            record.append(0)

                        #Send file info to server
                        # Header structure: file name length = 128 bytes; filesize = 8 bytes; IsCompressed = 4 bytes (int)
                        fhead = struct.pack('128sQI',
                                            bytes(filename.encode('utf-8')),
                                            filesize, IsCompressedVar.get())
                        clientSocket.send(fhead)
                        LoggingText.insert(
                            'insert', '{} file header sent\n'.format(ReqType))

                        sendStartTime = datetime.now()
                        #Send data to server
                        with open(filepath, 'rb') as fp:
                            data = fp.read()
                            clientSocket.sendall(data)
                            LoggingText.insert(
                                'insert',
                                '{} file send over...\n'.format(ReqType))
                        sendoverTime = datetime.now()
                        sendoverDetal = sendoverTime - sendStartTime
                        record.append('\'' + str(sendoverDetal))

                        LoggingText.insert(
                            'insert',
                            'Waiting for server processing and feedback\n')

                        rcvStartTime = datetime.now()
                        #4. Receive the processed result
                        fileinfo_size = struct.calcsize('128sQI')
                        fileinfo_data = clientSocket.recv(fileinfo_size)

                        if fileinfo_data:
                            filename, filesize, IsCompressed = struct.unpack(
                                '128sQI', fileinfo_data)
                            rcv_file_name = filename.decode('utf-8').strip(
                                '\x00')
                            LoggingText.insert(
                                'insert',
                                'Processed file header info is received for {}\n'
                                .format(ReqType))

                            received_size = 0
                            with open(rcv_file_name, 'wb') as rcv_file_handle:
                                while not (received_size == filesize):
                                    if (filesize - received_size > 4096):
                                        data = clientSocket.recv(4096)
                                        if data:
                                            received_size += len(data)
                                        else:
                                            isConnected = False
                                            break
                                    else:
                                        data = clientSocket.recv(filesize -
                                                                 received_size)
                                        if data:
                                            received_size = filesize
                                        else:
                                            isConnected = False
                                            break
                                    rcv_file_handle.write(data)
                            LoggingText.insert(
                                'insert',
                                'Processed file for {} is received\n'.format(
                                    ReqType))

                            reverseoverDetal = datetime.now() - rcvStartTime
                            record.append('\'' + str(reverseoverDetal))
                            if isConnected:
                                if IsCompressed:
                                    LoggingText.insert(
                                        'insert',
                                        'Processed file for {} was compressed\n'
                                        .format(ReqType))
                                    unzipStartTime = datetime.now()
                                    # with zipfile.ZipFile(rcv_file_name, 'r') as zf:
                                    #     filepath = zf.extract(zf.namelist()[0]) #suppose only one file
                                    #     #rcv_file_name = os.path.basename(filepath)
                                    #     rcv_file_name = filepath
                                    with lzma.open(rcv_file_name, 'rb') as f:
                                        zipContent = f.read()
                                        localFileName = 'ReceivedProcessedFor{}.txt'.format(
                                            ReqType)
                                        with open(localFileName, 'w') as uf:
                                            uf.write(
                                                zipContent.decode("utf-8"))
                                        # Display partial content in GUI
                                        ProcessedFileText.delete(1.0, 'end')
                                        ProcessedFileText.insert(
                                            'insert',
                                            zipContent.decode("utf-8")[0:1000])
                                    unzipDuration = datetime.now(
                                    ) - unzipStartTime
                                    record.append('\'' + str(unzipDuration))
                                    LoggingText.insert(
                                        'insert',
                                        'Processed file for {} is decompressed\n'
                                        .format(ReqType))
                                else:
                                    record.append(0)

                                total_duration = datetime.now() - startTime
                                record.append('\'' + str(total_duration))

                                #with open(rcv_file_name,'rb') as rf:
                                #    all_data_str = rf.read().decode('utf-8')

                                #LoggingText.insert('insert', 'Processed file is stored locally\n')
                                #5. Display the replaced result
                                #ProcessedFileText.delete(1.0,'end')
                                #ProcessedFileText.insert('insert', all_data_str[0:2000])

                                #ProcessedFileText.insert('insert', 'Processed data received')
                            else:
                                LoggingText.insert(
                                    'insert',
                                    'No connection! Please connect firstly\n')
                    else:
                        LoggingText.insert('insert',
                                           'The file path is not valid')
            else:
                isConnected = False
                LoggingText.insert('insert',
                                   'No connection! Please connect firstly\n')
    else:
        LoggingText.insert('insert', 'No connection! Please connect firstly\n')
    print('Request thread for {} ended'.format(ReqType))
    with open('record.csv', 'a+') as csv_record:
        csv_write = csv.writer(csv_record)
        csv_write.writerow(record)
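The '128sQI' header used in this client packs a null-padded 128-byte filename, an unsigned 64-bit file size and a 32-bit compression flag. A small round-trip sketch (the values are examples, not taken from the program above):

import struct

fhead = struct.pack('128sQI', 'report.xz'.encode('utf-8'), 2048, 1)
name, size, compressed = struct.unpack('128sQI', fhead)
print(name.decode('utf-8').rstrip('\x00'), size, compressed)  # report.xz 2048 1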
Example #53
0
    colorama.init(autoreset=True, strip=False)
    if args.dark:
        color_type = colorama.Fore.BLUE + colorama.Style.BRIGHT
        color_obj = colorama.Fore.YELLOW + colorama.Style.NORMAL
    else:
        color_type = colorama.Fore.BLUE + colorama.Style.NORMAL
        color_obj = colorama.Fore.YELLOW + colorama.Style.DIM

# next-char values which will trigger ignoreself
ignorechars = set([':', '.'])

# Loop through and search
with os.scandir(os.path.join('resources', game, 'dumps')) as it:
    for entry in sorted(it, key=lambda e: getattr(e, 'name').lower()):
        if entry.name[-8:] == '.dump.xz' or entry.name[-7:] == '.txt.xz':
            with lzma.open(entry.path, 'rt', encoding='latin1') as df:
                cur_obj = None
                cur_type = None
                found_result = False
                for line in df.readlines():
                    match = re.search(
                        r'\*\*\* Property dump for object \'(\S+) (\S+?)\' \*\*\*',
                        line)
                    if match:
                        cur_type = match.group(1)
                        cur_obj = match.group(2)
                        if args.ignoreself and cur_obj.lower().startswith(
                                ignore_search_str):
                            if len(cur_obj) > len(ignore_search_str):
                                if cur_obj[len(
                                        ignore_search_str)] in ignorechars:
Example #54
0
def save_compressed(filename):
    # create an lzma-compressed copy alongside the original file
    xz_filename = filename + '.xz'
    with lzma.open(xz_filename, 'wt', preset=9) as f:
        with open(filename, 'r') as content_file:
            f.write(content_file.read())
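A usage sketch (the filename is hypothetical): preset=9 selects the slowest, highest-compression xz preset, so for very large inputs a lower preset trades ratio for speed.

save_compressed('results.txt')  # leaves results.txt in place and writes results.txt.xz next to it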
Example #55
0
def main():

    parser = argparse.ArgumentParser(
        prog=_program,
        description=
        'pangolin: Phylogenetic Assignment of Named Global Outbreak LINeages',
        usage='''pangolin <query> [options]''')

    parser.add_argument('query',
                        nargs="*",
                        help='Query fasta file of sequences to analyse.')
    parser.add_argument(
        '-o',
        '--outdir',
        action="store",
        help="Output directory. Default: current working directory")
    parser.add_argument(
        '--outfile',
        action="store",
        help="Optional output file name. Default: lineage_report.csv")
    parser.add_argument('--alignment',
                        action="store_true",
                        help="Optional alignment output.")
    parser.add_argument(
        '-d',
        '--datadir',
        action='store',
        dest="datadir",
        help=
        "Data directory minimally containing a fasta alignment and guide tree")
    parser.add_argument(
        '--tempdir',
        action="store",
        help="Specify where you want the temp stuff to go. Default: $TMPDIR")
    parser.add_argument(
        "--no-temp",
        action="store_true",
        help="Output all intermediate files, for dev purposes.")
    parser.add_argument(
        '--decompress-model',
        action="store_true",
        dest="decompress",
        help=
        "Permanently decompress the model file to save time running pangolin.")
    parser.add_argument(
        '--max-ambig',
        action="store",
        default=0.5,
        type=float,
        help=
        "Maximum proportion of Ns allowed for pangolin to attempt assignment. Default: 0.5",
        dest="maxambig")
    parser.add_argument(
        '--min-length',
        action="store",
        default=25000,
        type=int,
        help=
        "Minimum query length allowed for pangolin to attempt assignment. Default: 25000",
        dest="minlen")
    parser.add_argument('--panGUIlin',
                        action='store_true',
                        help="Run web-app version of pangolin",
                        dest="panGUIlin")
    parser.add_argument("--verbose",
                        action="store_true",
                        help="Print lots of stuff to screen")
    parser.add_argument("-t",
                        "--threads",
                        action="store",
                        help="Number of threads")
    parser.add_argument("-v",
                        "--version",
                        action='version',
                        version=f"pangolin {__version__}")
    parser.add_argument("-pv",
                        "--pangoLEARN-version",
                        action='version',
                        version=f"pangoLEARN {pangoLEARN.__version__}",
                        help="show pangoLEARN's version number and exit")
    parser.add_argument(
        "--update",
        action='store_true',
        default=False,
        help=
        "Automatically updates to latest release of pangolin and pangoLEARN, then exits"
    )

    compression = parser.add_mutually_exclusive_group()
    compression.add_argument("--gzip",
                             action="store_true",
                             help="Query files are gzip-compressed.")
    compression.add_argument("--xz",
                             action="store_true",
                             help="Query files are xz-compressed.")

    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(-1)
    args = parser.parse_args()

    if args.update:
        update(__version__, pangoLEARN.__version__)

    snakefile = os.path.join(thisdir, 'scripts', 'pangolearn.smk')
    if not os.path.exists(snakefile):
        sys.stderr.write(
            'Error: cannot find Snakefile at {}\n'.format(snakefile))
        sys.exit(-1)

    pfunk.check_installs()

    # to enable not having to pass a query if running update
    # by allowing query to accept 0 to many arguments
    if len(args.query) > 1:
        print(
            pfunk.cyan(
                f"Error: Too many query (input) fasta files supplied: {args.query}\nPlease supply one only"
            ))
        parser.print_help()
        sys.exit(-1)
    else:
        # find the query fasta
        query = os.path.join(cwd, args.query[0])
        if not os.path.exists(query):
            sys.stderr.write(
                'Error: cannot find query (input) fasta file at {}\n'
                'Please enter your fasta sequence file and refer to pangolin usage at:\n'
                'https://github.com/hCoV-2019/pangolin#usage\n'
                ' for detailed instructions\n'.format(query))
            sys.exit(-1)
        else:
            print(pfunk.green(f"The query file is:") + f"{query}")

    # default output dir
    if args.outdir:
        outdir = os.path.join(cwd, args.outdir)
        if not os.path.exists(outdir):
            try:
                os.mkdir(outdir)
            except:
                sys.stderr.write(
                    pfunk.cyan(f'Error: cannot create directory:') +
                    f"{outdir}")
                sys.exit(-1)
    else:
        outdir = cwd

    if args.outfile:
        outfile = os.path.join(outdir, args.outfile)
    else:
        outfile = os.path.join(outdir, "lineage_report.csv")

    if args.tempdir:
        to_be_dir = os.path.join(cwd, args.tempdir)
        if not os.path.exists(to_be_dir):
            os.mkdir(to_be_dir)
        temporary_directory = tempfile.TemporaryDirectory(suffix=None,
                                                          prefix=None,
                                                          dir=to_be_dir)
        tempdir = temporary_directory.name
    else:
        temporary_directory = tempfile.TemporaryDirectory(suffix=None,
                                                          prefix=None,
                                                          dir=None)
        tempdir = temporary_directory.name

    if args.no_temp:
        print(
            pfunk.green(f"--no-temp:") +
            f"all intermediate files will be written to {outdir}")
        tempdir = outdir

    if args.alignment:
        align_dir = outdir
        alignment_out = True
    else:
        align_dir = tempdir
        alignment_out = False

    if args.threads:
        print(
            pfunk.cyan(
                f"\n--threads flag used, but threading not currently supported. Continuing with one thread."
            ))
    """
    QC steps:
    1) check no empty seqs
    2) check N content
    3) write a file that contains just the seqs to run
    """

    do_not_run = []
    run = []

    if args.gzip:
        # user says input FASTA file is gzip-compressed, use gzip module to stream text to SeqIO
        query = gzip.open(query, 'rt')  # replace file path (str) with handle
    if args.xz:
        query = lzma.open(query, 'rt')

    for record in SeqIO.parse(query, "fasta"):
        # replace spaces in sequence headers with underscores
        record.description = record.description.replace(' ', '_')
        record.id = record.description
        if "," in record.id:
            record.id = record.id.replace(",", "_")

        if len(record) < args.minlen:
            record.description = record.description + f" fail=seq_len:{len(record)}"
            do_not_run.append(record)
            print(record.id, "\tsequence too short")
        else:
            num_N = str(record.seq).upper().count("N")
            prop_N = round(num_N / len(record.seq), 2)
            if prop_N > args.maxambig:
                record.description = record.description + f" fail=N_content:{prop_N}"
                do_not_run.append(record)
                print(f"{record.id}\thas an N content of {prop_N}")
            else:
                run.append(record)

    if run == []:
        with open(outfile, "w") as fw:
            fw.write(
                "taxon,lineage,conflict,pangolin_version,pangoLEARN_version,pango_version,status,note\n"
            )
            for record in do_not_run:
                desc = record.description.split(" ")
                reason = ""
                for item in desc:
                    if item.startswith("fail="):
                        reason = item.split("=")[1]
                fw.write(
                    f"{record.id},None,NA,{__version__},{pangoLEARN.__version__},PANGO_VERSION,fail,{reason}\n"
                )
        print(pfunk.cyan(f'Note: no query sequences have passed the qc\n'))
        sys.exit(0)

    post_qc_query = os.path.join(tempdir, 'query.post_qc.fasta')
    with open(post_qc_query, "w") as fw:
        SeqIO.write(run, fw, "fasta")
    qc_fail = os.path.join(tempdir, 'query.failed_qc.fasta')
    with open(qc_fail, "w") as fw:
        SeqIO.write(do_not_run, fw, "fasta")

    config = {
        "query_fasta": post_qc_query,
        "outdir": outdir,
        "outfile": outfile,
        "tempdir": tempdir,
        "aligndir": align_dir,
        "alignment_out": alignment_out,
        "trim_start": 265,  # where to pad to using datafunk
        "trim_end": 29674,  # where to pad after using datafunk
        "qc_fail": qc_fail,
        "pangoLEARN_version": pangoLEARN.__version__,
        "pangolin_version": __version__,
        "pango_version": PANGO_VERSION
    }

    # find the data
    data_dir = ""
    if args.datadir:
        data_dir = os.path.join(cwd, args.datadir)
        version = "Unknown"
        for r, d, f in os.walk(data_dir):
            for fn in f:
                if fn == "__init__.py":
                    print("Found __init__.py")
                    with open(os.path.join(r, fn), "r") as fr:
                        for l in fr:
                            if l.startswith("__version__"):
                                l = l.rstrip("\n")
                                version = l.split('=')[1]
                                version = version.replace('"',
                                                          "").replace(" ", "")
                                print("pangoLEARN version", version)
        config["pangoLEARN_version"] = version

    if not args.datadir:
        pangoLEARN_dir = pangoLEARN.__path__[0]
        data_dir = os.path.join(pangoLEARN_dir, "data")
    print(f"Looking in {data_dir} for data files...")
    trained_model = ""
    header_file = ""
    lineages_csv = ""

    for r, d, f in os.walk(data_dir):
        for fn in f:
            if fn == "decisionTreeHeaders_v1.joblib":
                header_file = os.path.join(r, fn)
            elif fn == "decisionTree_v1.joblib":
                trained_model = os.path.join(r, fn)
            elif fn == "lineages.metadata.csv":
                lineages_csv = os.path.join(r, fn)
    if trained_model == "" or header_file == "" or lineages_csv == "":
        print(
            pfunk.cyan(
                "Check your environment, didn't find appropriate files from the pangoLEARN repo.\n"
                "Trained model must be installed, please see https://cov-lineages.org/pangolin.html "
                "for installation instructions."))
        exit(1)
    else:
        if args.decompress:
            prev_size = os.path.getsize(trained_model)

            print("Decompressing model and header files")
            model = joblib.load(trained_model)
            joblib.dump(model, trained_model, compress=0)
            headers = joblib.load(header_file)
            joblib.dump(headers, header_file, compress=0)

            if os.path.getsize(trained_model) >= prev_size:
                print(
                    pfunk.green(
                        f'Success! Decompressed the model file. Exiting\n'))
                sys.exit(0)
            else:
                print(
                    pfunk.cyan(
                        f'Error: failed to decompress model. Exiting\n'))
                sys.exit(0)

        print(pfunk.green("\nData files found"))
        print(f"Trained model:\t{trained_model}")
        print(f"Header file:\t{header_file}")
        print(f"Lineages csv:\t{lineages_csv}")
        config["trained_model"] = trained_model
        config["header_file"] = header_file

    reference_fasta = pkg_resources.resource_filename('pangolin',
                                                      'data/reference.fasta')
    config["reference_fasta"] = reference_fasta

    variants_file = pkg_resources.resource_filename('pangolin',
                                                    'data/config_b.1.1.7.csv')
    config["b117_variants"] = variants_file

    variants_file = pkg_resources.resource_filename('pangolin',
                                                    'data/config_b.1.351.csv')
    config["b1351_variants"] = variants_file

    variants_file = pkg_resources.resource_filename('pangolin',
                                                    'data/config_p.1.csv')
    config["p1_variants"] = variants_file

    variants_file = pkg_resources.resource_filename('pangolin',
                                                    'data/config_p.2.csv')
    config["p2_variants"] = variants_file

    variants_file = pkg_resources.resource_filename('pangolin',
                                                    'data/config_p.3.csv')
    config["p3_variants"] = variants_file

    if args.panGUIlin:
        config["lineages_csv"] = lineages_csv

    if args.verbose:
        quiet_mode = False
        config["log_string"] = ""
    else:
        quiet_mode = True
        lh_path = os.path.realpath(lh.__file__)
        config["log_string"] = f"--quiet --log-handler-script {lh_path} "

    if args.verbose:
        print(pfunk.green("\n**** CONFIG ****"))
        for k in sorted(config):
            print(pfunk.green(k), config[k])

        status = snakemake.snakemake(snakefile,
                                     printshellcmds=True,
                                     forceall=True,
                                     force_incomplete=True,
                                     workdir=tempdir,
                                     config=config,
                                     cores=1,
                                     lock=False)
    else:
        logger = custom_logger.Logger()
        status = snakemake.snakemake(snakefile,
                                     printshellcmds=False,
                                     forceall=True,
                                     force_incomplete=True,
                                     workdir=tempdir,
                                     config=config,
                                     cores=1,
                                     lock=False,
                                     quiet=True,
                                     log_handler=logger.log_handler)

    if status:  # translate "success" into shell exit code of 0
        return 0

    return 1
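One detail worth noting in the --gzip/--xz handling above: the query variable is silently switched from a path to an open handle that is never explicitly closed. A hedged sketch of an equivalent helper (the names are illustrative, not part of pangolin) that keeps the SeqIO loop unchanged while letting a with block close the handle:

import gzip
import lzma

def open_query(path, use_gzip=False, use_xz=False):
    # Return a text-mode handle for a plain, gzip- or xz-compressed FASTA file.
    if use_gzip:
        return gzip.open(path, 'rt')
    if use_xz:
        return lzma.open(path, 'rt')
    return open(path, 'rt')

# e.g. with open_query(query, args.gzip, args.xz) as handle:
#          records = SeqIO.parse(handle, "fasta")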
Example #56
0
    [[0, 0], [0, 0], [0, 0]],
    [[0, 0], [0, 0], [0, 0]],
    [[0, 0], [0, 0], [0, 0]],
    [[0, 0], [0, 0], [0, 0]],
]

reach_turn_points = [[0] * TURNS, [0] * TURNS, [0] * TURNS, [0] * TURNS,
                     [0] * TURNS]
reach_turn_counts = [[0] * TURNS, [0] * TURNS, [0] * TURNS, [0] * TURNS,
                     [0] * TURNS]

outcome_names = ('I won', 'Draw', 'Bystander', 'Other tsumod', 'I dealt in',
                 'Averages')
for player in account_names:
    counter.player = player
    with lzma.open(directory_name + player + '.pickle.7z', 'rb') as infile:
        logs = pickle.load(infile)

    for key, log in logs.items():
        if args.since and args.since > key[0:8]:
            continue
        if args.before and args.before <= key[0:8]:
            continue
        gamecount += 1
        game = TenhouDecoder.Game(lang='DEFAULT', suppress_draws=False)
        game.decode(log['content'].decode())
        counter.reach_outcomes = []
        counter.addGame(game)

        for outcome in counter.reach_outcomes:
            # aggregate counter.reach_outcomes
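Despite the .pickle.7z suffix, lzma.open reads and writes xz/lzma streams rather than 7-Zip archives, so the file read above was presumably produced by something like the following sketch (names and contents are placeholders):

import lzma
import pickle

logs = {'2020010100gm-0001': {'content': b'<mjloggm ver="2.3">...</mjloggm>'}}
with lzma.open('account.pickle.7z', 'wb') as outfile:
    pickle.dump(logs, outfile, protocol=pickle.HIGHEST_PROTOCOL)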
Example #57
0
def main():
    module = AnsibleModule(
        argument_spec=dict(
            path=dict(type='list', required=True),
            format=dict(type='str',
                        default='gz',
                        choices=['bz2', 'gz', 'tar', 'xz', 'zip']),
            dest=dict(type='path'),
            exclude_path=dict(type='list'),
            force_archive=dict(type='bool', default=False),
            remove=dict(type='bool', default=False),
        ),
        add_file_common_args=True,
        supports_check_mode=True,
    )

    params = module.params
    check_mode = module.check_mode
    paths = params['path']
    dest = params['dest']
    b_dest = None if not dest else to_bytes(dest, errors='surrogate_or_strict')
    exclude_paths = params['exclude_path']
    remove = params['remove']

    b_expanded_paths = []
    b_expanded_exclude_paths = []
    fmt = params['format']
    b_fmt = to_bytes(fmt, errors='surrogate_or_strict')
    force_archive = params['force_archive']
    globby = False
    changed = False
    state = 'absent'

    # Simple or archive file compression (inapplicable with 'zip' since it's always an archive)
    archive = False
    b_successes = []

    # Fail early
    if not HAS_LZMA and fmt == 'xz':
        module.fail_json(msg=missing_required_lib(
            "lzma or backports.lzma", reason="when using xz format"),
                         exception=LZMA_IMP_ERR)

    for path in paths:
        b_path = os.path.expanduser(
            os.path.expandvars(to_bytes(path, errors='surrogate_or_strict')))

        # Expand any glob characters. If found, add the expanded glob to the
        # list of expanded_paths, which might be empty.
        if (b'*' in b_path or b'?' in b_path):
            b_expanded_paths.extend(glob.glob(b_path))
            globby = True

        # If there are no glob characters the path is added to the expanded paths
        # whether the path exists or not
        else:
            b_expanded_paths.append(b_path)

    # Only attempt to expand the exclude paths if it exists
    if exclude_paths:
        for exclude_path in exclude_paths:
            b_exclude_path = os.path.expanduser(
                os.path.expandvars(
                    to_bytes(exclude_path, errors='surrogate_or_strict')))

            # Expand any glob characters. If found, add the expanded glob to the
            # list of expanded_paths, which might be empty.
            if (b'*' in b_exclude_path or b'?' in b_exclude_path):
                b_expanded_exclude_paths.extend(glob.glob(b_exclude_path))

                # If there are no glob character the exclude path is added to the expanded
                # exclude paths whether the path exists or not.
            else:
                b_expanded_exclude_paths.append(b_exclude_path)

    if not b_expanded_paths:
        return module.fail_json(path=', '.join(paths),
                                expanded_paths=to_native(
                                    b', '.join(b_expanded_paths),
                                    errors='surrogate_or_strict'),
                                msg='Error, no source paths were found')

    # Only try to determine if we are working with an archive or not if we haven't set archive to true
    if not force_archive:
        # If we actually matched multiple files or TRIED to, then
        # treat this as a multi-file archive
        archive = globby or os.path.isdir(
            b_expanded_paths[0]) or len(b_expanded_paths) > 1
    else:
        archive = True

    # Default created file name (for single-file archives) to
    # <file>.<format>
    if not b_dest and not archive:
        b_dest = b'%s.%s' % (b_expanded_paths[0], b_fmt)

    # Force archives to specify 'dest'
    if archive and not b_dest:
        module.fail_json(
            dest=dest,
            path=', '.join(paths),
            msg=
            'Error, must specify "dest" when archiving multiple files or trees'
        )

    b_sep = to_bytes(os.sep, errors='surrogate_or_strict')

    b_archive_paths = []
    b_missing = []
    b_arcroot = b''

    for b_path in b_expanded_paths:
        # Use the longest common directory name among all the files
        # as the archive root path
        if b_arcroot == b'':
            b_arcroot = os.path.dirname(b_path) + b_sep
        else:
            for i in range(len(b_arcroot)):
                if b_path[i] != b_arcroot[i]:
                    break

            if i < len(b_arcroot):
                b_arcroot = os.path.dirname(b_arcroot[0:i + 1])

            b_arcroot += b_sep

        # Don't allow archives to be created anywhere within paths to be removed
        if remove and os.path.isdir(b_path):
            b_path_dir = b_path
            if not b_path.endswith(b'/'):
                b_path_dir += b'/'

            if b_dest.startswith(b_path_dir):
                module.fail_json(
                    path=', '.join(paths),
                    msg=
                    'Error, created archive can not be contained in source paths when remove=True'
                )

        if os.path.lexists(b_path) and b_path not in b_expanded_exclude_paths:
            b_archive_paths.append(b_path)
        else:
            b_missing.append(b_path)

    # No source files were found but the named archive exists: are we 'compress' or 'archive' now?
    if len(b_missing) == len(b_expanded_paths) and b_dest and os.path.exists(
            b_dest):
        # Just check the filename to know if it's an archive or simple compressed file
        if re.search(
                br'(\.tar|\.tar\.gz|\.tgz|\.tbz2|\.tar\.bz2|\.tar\.xz|\.zip)$',
                os.path.basename(b_dest), re.IGNORECASE):
            state = 'archive'
        else:
            state = 'compress'

    # Multiple files, or globbiness
    elif archive:
        if not b_archive_paths:
            # No source files were found, but the archive is there.
            if os.path.lexists(b_dest):
                state = 'archive'
        elif b_missing:
            # SOME source files were found, but not all of them
            state = 'incomplete'

        archive = None
        size = 0
        errors = []

        if os.path.lexists(b_dest):
            size = os.path.getsize(b_dest)

        if state != 'archive':
            if check_mode:
                changed = True

            else:
                try:
                    # Slightly more difficult (and less efficient!) compression using zipfile module
                    if fmt == 'zip':
                        arcfile = zipfile.ZipFile(
                            to_native(b_dest,
                                      errors='surrogate_or_strict',
                                      encoding='ascii'), 'w',
                            zipfile.ZIP_DEFLATED, True)

                    # Easier compression using tarfile module
                    elif fmt == 'gz' or fmt == 'bz2':
                        arcfile = tarfile.open(
                            to_native(b_dest,
                                      errors='surrogate_or_strict',
                                      encoding='ascii'), 'w|' + fmt)

                    # python3 tarfile module allows xz format but for python2 we have to create the tarfile
                    # in memory and then compress it with lzma.
                    elif fmt == 'xz':
                        arcfileIO = io.BytesIO()
                        arcfile = tarfile.open(fileobj=arcfileIO, mode='w')

                    # Or plain tar archiving
                    elif fmt == 'tar':
                        arcfile = tarfile.open(
                            to_native(b_dest,
                                      errors='surrogate_or_strict',
                                      encoding='ascii'), 'w')

                    b_match_root = re.compile(br'^%s' % re.escape(b_arcroot))
                    for b_path in b_archive_paths:
                        if os.path.isdir(b_path):
                            # Recurse into directories
                            for b_dirpath, b_dirnames, b_filenames in os.walk(
                                    b_path, topdown=True):
                                if not b_dirpath.endswith(b_sep):
                                    b_dirpath += b_sep

                                for b_dirname in b_dirnames:
                                    b_fullpath = b_dirpath + b_dirname
                                    n_fullpath = to_native(
                                        b_fullpath,
                                        errors='surrogate_or_strict',
                                        encoding='ascii')
                                    n_arcname = to_native(
                                        b_match_root.sub(b'', b_fullpath),
                                        errors='surrogate_or_strict')

                                    try:
                                        if fmt == 'zip':
                                            arcfile.write(
                                                n_fullpath, n_arcname)
                                        else:
                                            arcfile.add(n_fullpath,
                                                        n_arcname,
                                                        recursive=False)

                                    except Exception as e:
                                        errors.append(
                                            '%s: %s' %
                                            (n_fullpath, to_native(e)))

                                for b_filename in b_filenames:
                                    b_fullpath = b_dirpath + b_filename
                                    n_fullpath = to_native(
                                        b_fullpath,
                                        errors='surrogate_or_strict',
                                        encoding='ascii')
                                    n_arcname = to_native(
                                        b_match_root.sub(b'', b_fullpath),
                                        errors='surrogate_or_strict')

                                    if not filecmp.cmp(b_fullpath, b_dest):
                                        try:
                                            if fmt == 'zip':
                                                arcfile.write(
                                                    n_fullpath, n_arcname)
                                            else:
                                                arcfile.add(n_fullpath,
                                                            n_arcname,
                                                            recursive=False)

                                            b_successes.append(b_fullpath)
                                        except Exception as e:
                                            errors.append('Adding %s: %s' %
                                                          (to_native(b_path),
                                                           to_native(e)))
                        else:
                            path = to_native(b_path,
                                             errors='surrogate_or_strict',
                                             encoding='ascii')
                            arcname = to_native(b_match_root.sub(b'', b_path),
                                                errors='surrogate_or_strict')
                            if fmt == 'zip':
                                arcfile.write(path, arcname)
                            else:
                                arcfile.add(path, arcname, recursive=False)

                            b_successes.append(b_path)

                except Exception as e:
                    expanded_fmt = 'zip' if fmt == 'zip' else ('tar.' + fmt)
                    module.fail_json(
                        msg='Error when writing %s archive at %s: %s' %
                        (expanded_fmt, dest, to_native(e)),
                        exception=format_exc())

                if arcfile:
                    arcfile.close()
                    state = 'archive'

                if fmt == 'xz':
                    with lzma.open(b_dest, 'wb') as f:
                        f.write(arcfileIO.getvalue())
                    arcfileIO.close()

                if errors:
                    module.fail_json(
                        msg='Errors when writing archive at %s: %s' %
                        (dest, '; '.join(errors)))

        if state in ['archive', 'incomplete'] and remove:
            for b_path in b_successes:
                try:
                    if os.path.isdir(b_path):
                        shutil.rmtree(b_path)
                    elif not check_mode:
                        os.remove(b_path)
                except OSError as e:
                    errors.append(to_native(b_path))

            if errors:
                module.fail_json(dest=dest,
                                 msg='Error deleting some source files: ',
                                 files=errors)

        # Rudimentary check: If size changed then file changed. Not perfect, but easy.
        if not check_mode and os.path.getsize(b_dest) != size:
            changed = True

        if b_successes and state != 'incomplete':
            state = 'archive'

    # Simple, single-file compression
    else:
        b_path = b_expanded_paths[0]

        # No source or compressed file
        if not (os.path.exists(b_path) or os.path.lexists(b_dest)):
            state = 'absent'

        # if it already exists and the source file isn't there, consider this done
        elif not os.path.lexists(b_path) and os.path.lexists(b_dest):
            state = 'compress'

        else:
            if module.check_mode:
                if not os.path.exists(b_dest):
                    changed = True
            else:
                size = 0
                f_in = f_out = arcfile = None

                if os.path.lexists(b_dest):
                    size = os.path.getsize(b_dest)

                try:
                    if fmt == 'zip':
                        arcfile = zipfile.ZipFile(
                            to_native(b_dest,
                                      errors='surrogate_or_strict',
                                      encoding='ascii'), 'w',
                            zipfile.ZIP_DEFLATED, True)
                        arcfile.write(
                            to_native(b_path,
                                      errors='surrogate_or_strict',
                                      encoding='ascii'),
                            to_native(b_path[len(b_arcroot):],
                                      errors='surrogate_or_strict'))
                        arcfile.close()
                        state = 'archive'  # because all zip files are archives
                    elif fmt == 'tar':
                        arcfile = tarfile.open(
                            to_native(b_dest,
                                      errors='surrogate_or_strict',
                                      encoding='ascii'), 'w')
                        arcfile.add(
                            to_native(b_path,
                                      errors='surrogate_or_strict',
                                      encoding='ascii'))
                        arcfile.close()
                    else:
                        f_in = open(b_path, 'rb')

                        n_dest = to_native(b_dest,
                                           errors='surrogate_or_strict',
                                           encoding='ascii')
                        if fmt == 'gz':
                            f_out = gzip.open(n_dest, 'wb')
                        elif fmt == 'bz2':
                            f_out = bz2.BZ2File(n_dest, 'wb')
                        elif fmt == 'xz':
                            f_out = lzma.LZMAFile(n_dest, 'wb')
                        else:
                            raise OSError("Invalid format")

                        shutil.copyfileobj(f_in, f_out)

                    b_successes.append(b_path)

                except OSError as e:
                    module.fail_json(
                        path=to_native(b_path),
                        dest=dest,
                        msg='Unable to write to compressed file: %s' %
                        to_native(e),
                        exception=format_exc())

                if arcfile:
                    arcfile.close()
                if f_in:
                    f_in.close()
                if f_out:
                    f_out.close()

                # Rudimentary check: If size changed then file changed. Not perfect, but easy.
                if os.path.getsize(b_dest) != size:
                    changed = True

            state = 'compress'

        if remove and not check_mode:
            try:
                os.remove(b_path)

            except OSError as e:
                module.fail_json(path=to_native(b_path),
                                 msg='Unable to remove source file: %s' %
                                 to_native(e),
                                 exception=format_exc())

    params['path'] = b_dest
    file_args = module.load_file_common_arguments(params)

    if not check_mode:
        changed = module.set_fs_attributes_if_different(file_args, changed)

    module.exit_json(
        archived=[
            to_native(p, errors='surrogate_or_strict') for p in b_successes
        ],
        dest=dest,
        changed=changed,
        state=state,
        arcroot=to_native(b_arcroot, errors='surrogate_or_strict'),
        missing=[
            to_native(p, errors='surrogate_or_strict') for p in b_missing
        ],
        expanded_paths=[
            to_native(p, errors='surrogate_or_strict')
            for p in b_expanded_paths
        ],
        expanded_exclude_paths=[
            to_native(p, errors='surrogate_or_strict')
            for p in b_expanded_exclude_paths
        ],
    )
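As the comment in the xz branch above notes, Python 3's tarfile can write xz archives directly; a minimal sketch of that one-step form, which removes the need for the in-memory BytesIO buffer and the final lzma.open rewrite (paths are hypothetical):

import tarfile

with tarfile.open('backup.tar.xz', 'w:xz') as arcfile:
    arcfile.add('data/', arcname='data')  # tarfile handles the xz compression itself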
Example #58
0
def xz_file(tmp_path_factory):
    filename = tmp_path_factory.mktemp("data") / "file.xz"
    data = bytes(FILE_CONTENT, "utf-8")
    with lzma.open(filename, "wb") as f:
        f.write(data)
    return filename
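A hedged sketch of a test that might consume this fixture (the test name is made up; FILE_CONTENT is the module-level constant referenced above):

import lzma

def test_xz_file_contains_expected_text(xz_file):
    with lzma.open(xz_file, "rt", encoding="utf-8") as f:
        assert f.read() == FILE_CONTENT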
Example #59
0
    def download_pdb_isf(
            self,
            guid: str,
            age: int,
            pdb_name: str,
            progress_callback: constants.ProgressCallback = None) -> None:
        """Attempts to download the PDB file, convert it to an ISF file and
        save it to one of the symbol locations."""
        # Check for writability
        filter_string = os.path.join(pdb_name, guid + "-" + str(age))
        for path in symbols.__path__:

            # Store any temporary files created by downloading PDB files
            tmp_files = []
            potential_output_filename = os.path.join(
                path, "windows", filter_string + ".json.xz")
            data_written = False
            try:
                os.makedirs(os.path.dirname(potential_output_filename),
                            exist_ok=True)
                with lzma.open(potential_output_filename, "w") as of:
                    # Once we haven't thrown an error, do the computation
                    filename = pdbconv.PdbRetreiver().retreive_pdb(
                        guid + str(age),
                        file_name=pdb_name,
                        progress_callback=progress_callback)
                    if filename:
                        tmp_files.append(filename)
                        location = "file:" + request.pathname2url(
                            tmp_files[-1])
                        json_output = pdbconv.PdbReader(
                            self.context, location,
                            progress_callback).get_json()
                        of.write(
                            bytes(
                                json.dumps(json_output,
                                           indent=2,
                                           sort_keys=True), 'utf-8'))
                        # After we've successfully written it out, record the fact so we don't clear it out
                        data_written = True
                    else:
                        vollog.warning(
                            "Symbol file could not be found on remote server" +
                            (" " * 100))
                break
            except PermissionError:
                vollog.warning(
                    "Cannot write necessary symbol file, please check permissions on {}"
                    .format(potential_output_filename))
                continue
            finally:
                # If something else failed, remove the symbol file so we don't pick it up in the future
                if not data_written and os.path.exists(
                        potential_output_filename):
                    os.remove(potential_output_filename)
                # Clear out any temporary files we constructed
                for filename in tmp_files:
                    try:
                        os.remove(filename)
                    except PermissionError:
                        vollog.warning(
                            "Temporary file could not be removed: {}".format(
                                filename))
        else:
            vollog.warning(
                "Cannot write downloaded symbols, please add the appropriate symbols"
                " or add/modify a symbols directory that is writable")
Example #60
0
def resolve_all_links_and_redirects(encoding='utf-8'):
    page_file_name = os.path.join(config.which_wiki, 'pages.lzma')
    link_file_name = os.path.join(config.which_wiki, 'pagelinks.lzma')
    redirect_file_name = os.path.join(config.which_wiki, 'redirects.lzma')

    page_id_to_title = {}
    page_title_to_id = {}
    resolved_ids = set()
    resolved_title_to_id = {}  # the final result of this work
    with lzma.open(page_file_name) as page_file:
        pages = page_file.read().decode(encoding, errors='replace').split('\n')
        count = 0
        for page in pages:
            tup = tuple(entry for entry in page.split('\t'))
            if len(tup) > 1:
                # map id to title, and title back to id, so that redirects
                # can be followed from their own title to the redirected id
                page_id_to_title[int(tup[0])] = tup[1]
                page_title_to_id[tup[1]] = int(tup[0])
            if len(tup) > 3:
                if int(tup[3]) == 0:
                    resolved_ids.add(int(tup[0]))
                    resolved_title_to_id[tup[1]] = int(tup[0])

    print ("Page file read in.  Length: {}".format(len(page_id_to_title)))
    print ("Number of resolved IDs: {}".format(len(resolved_ids)))
    num_redirect_failures = 0
    num_redirect_successes = 0
    with lzma.open(redirect_file_name) as redirect_file:
        redirects = redirect_file.read().decode(encoding, errors='replace').split('\n')
        # count = 0
        for redirect in redirects:
            tup = tuple(entry for entry in redirect.split('\t'))
            # the file is actually organized as 'redirect page id, final page title',
            # which has to be unraveled: use the previously read pages to turn the
            # redirect page id back into its own title, then follow the target title
            # through page_title_to_id to get the redirected page's id
            # problem! there can be multiple levels of redirects
            if len(tup) > 1:
                # print (tup)
                try:
                    original_page_title = page_id_to_title[int(tup[0])]
                except (KeyError, ValueError):  # redirect source page isn't in the page table
                    continue
                # print (original_page_title)
                # have to follow the rabbit hole down to resolve the vast majority of
                # redirects: keep looking until there's a hit in the resolved_ids set
                try:
                    redirect_count = 0
                    current_title = tup[1]
                    current_id = int(page_title_to_id[current_title])
                    while current_id not in resolved_ids and redirect_count < 50:  # generous hop limit, in case there are redirect cycles
                        current_title = page_id_to_title[current_id]
                        current_id = int(page_title_to_id[current_title])
                        redirect_count += 1
                    if original_page_title not in resolved_title_to_id:
                        resolved_title_to_id[original_page_title] = current_id
                    else:
                        if current_id != resolved_title_to_id[original_page_title]:
                            print ("Already resolved {} to {} not {}".format(tup[1],
                                                                             resolved_title_to_id[original_page_title],
                                                                             current_id))
                    num_redirect_successes += 1
                except Exception as e:
                    num_redirect_failures += 1
                    old_print_statements = '''
                    print ("Tup {} broke after {} redirect disentanglements with exception {}".format(tup, redirect_count, sys.exc_info()[0]))
                    print ("Current title {} and current id {}".format(tup[1], current_id))
                    try:
                        print("Trying to print the page title {}".format(page_id_to_title[int(tup[0])]))
                        print("Could try {} as the final id".format(page_title_to_id[tup[1]]))
                        print("That id produces article tuple {}".format(all_articles[page_title_to_id[tup[1]]]))
                        print("Trying to print final id {}".format(title_to_id[tup[1]]))
                    except:
                        pass
                    '''

    print ("Redirects complete. {} succeeded, {} failed.".format(num_redirect_successes,
                                                                 num_redirect_failures))
    print ("Total resolved titles: {} vs total titles {}.".format(len(resolved_title_to_id),
                                                                  len(page_title_to_id)))

    # first, we go through the links, and build up a set of titles that they are linked to
    # then, we go through the page set and find the ids for those titles.
    # if the seed set size + the linked size is contained in the target size, expand the
    # seed set and repeat
    link_map = {}
    max_chunk_size = 1000000 if config.testing else 1000000000
    decompressor = lzma.LZMADecompressor()
    with open(link_file_name, 'rb') as link_file:
        data = link_file.read()
        # now, decompress a chunk at a time
        # if the line doesn't end in a carriage return, keep that last bit for the next line
        links = decompress_chunk(decompressor, data, encoding, max_chunk_size)
        count = 1
        link_line_failures = 0
        link_line_successes = 0
        while not decompressor.needs_input:
            print("reading decompressed lines {}".format(count))
            count += 1
            if config.testing and count > 2:
                break
            if not links:
                break
            for link_line in links:  # do I need to worry about incomplete lines?
                tup = tuple(entry for entry in link_line.split('\t'))
                if len(tup) > 1:
                    id_from_title = None
                    try:
                        id_from_title = int(resolved_title_to_id[tup[1]])
                    except (KeyError, ValueError):  # link target title was never resolved
                        if config.testing:
                            print ("tup {} from link_line {} broke somehow.".format(tup, link_line))
                        # print ("page id {}".format(page_title_to_id[tup[1]]))
                        link_line_failures += 1
                    if id_from_title:  # two try/excepts in case id_from_title is busted
                        try:
                            link_map[int(tup[0])] += [id_from_title]
                        except KeyError:  # first link seen from this source page
                            link_map[int(tup[0])] = [id_from_title]
                        link_line_successes += 1
            links = decompress_chunk(decompressor, decompressor.unused_data, encoding, max_chunk_size)
    print ("Link Files have been imported. Length of links: {}".format(len(link_map)))
    print ("Link line successes: {} failures: {}".format(link_line_successes, link_line_failures))
    return link_map
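Example #60 calls a `decompress_chunk` helper that is not shown in the snippet. A minimal sketch of what such a helper might look like, with the signature taken from the call sites above and the body an assumption: it feeds one chunk of compressed bytes to the incremental decompressor, caps the amount of output produced, and returns the decoded text split into lines.

# Hypothetical decompress_chunk helper assumed by the example above.
import lzma

def decompress_chunk(decompressor, data, encoding, max_chunk_size):
    # Decode up to max_chunk_size bytes of output from one compressed chunk
    # and hand the caller a list of lines.
    if not data:
        return []
    raw = decompressor.decompress(data, max_length=max_chunk_size)
    return raw.decode(encoding, errors='replace').split('\n')

# tiny self-test with made-up data
sample = lzma.compress(b"a\t1\nb\t2\n")
print(decompress_chunk(lzma.LZMADecompressor(), sample, 'utf-8', 1 << 20))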