def main():
    print('FlightDataInspector (c) Copyright 2013 Flight Data Services, Ltd.')
    print('  - Powered by POLARIS')
    print('  - http://www.flightdatacommunity.com')
    print()

    parser = argparse.ArgumentParser()

    parser.add_argument('file_path')
    parser.add_argument('--words', action='store', default=16384, type=int,
                        help='Number of words to read from the file.')
    parser.add_argument('--debug', action='store_true',
                        help='Enable debug logging.')
    parser.add_argument('--check-sync', action='store_true',
                        help='Check sync in the whole data.')

    args = parser.parse_args()

    if args.debug:
        logger.setLevel(logging.DEBUG)

    if os.path.splitext(args.file_path)[1].lower() == '.bz2':
        file_obj = bz2.BZ2File(args.file_path)
    else:
        file_obj = open(args.file_path, 'rb')

    res = inspect(file_obj, args.words)

    if res and args.check_sync:
        wps, word_index, pattern_name = res
        file_obj.seek(0)
        check_sync(file_obj, wps, word_index, pattern_name)

    file_obj.close()
def decompress(source, target):
    logging.debug("Starting decompression of %s to %s", repr(source),
                  repr(target))
    with open(source, "rb") as fsource:
        logging.debug("Parsing header")
        magic, method, majorversion, minorversion, pagesize, uncompressed_size = util.parse_header(
            fsource)
        logging.debug("    Magic number: %s", repr(magic))
        logging.debug("    Method: %s", repr(method))
        logging.debug("    Major version number: %d", majorversion)
        logging.debug("    Minor version number: %d", minorversion)
        logging.debug("    Page size: %d", pagesize)
        logging.debug("    Uncompressed size: %d", uncompressed_size)
        with open(target, "wb") as ftarget:
            curr_size = 0.0
            pagecnt = 0
            with bz2file.BZ2File(filename=fsource, mode="rb",
                                 compresslevel=9) as fsource:
                while True:
                    if pagecnt % 100 == 0 or curr_size == uncompressed_size:
                        sys.stdout.write("\rProgress: {:.2f}%".format(
                            curr_size / uncompressed_size * 100))
                        sys.stdout.flush()
                    page = fsource.read(pagesize)
                    if not page:
                        break
                    ftarget.write(page)
                    curr_size += len(page)
                    pagecnt += 1
            sys.stdout.write("\n")
    logging.debug("Done")
Example #3
def get_fileobj(filename,
                mode="r",
                gzip_only=False,
                bz2_only=False,
                zip_only=False):
    """
    Returns a fileobj. If the file is compressed, return appropriate file reader.

    :param filename: path to file that should be opened
    :param mode: mode to pass to opener
    :param gzip_only: only open file if file is gzip compressed or not compressed
    :param bz2_only: only open file if file is bz2 compressed or not compressed
    :param zip_only: only open file if file is zip compressed or not compressed
    """
    # the various compression readers don't support 'U' mode,
    # so we open in 'r'.
    if mode == 'U':
        cmode = 'r'
    else:
        cmode = mode
    if not bz2_only and not zip_only and is_gzip(filename):
        return gzip.GzipFile(filename, cmode)
    if not gzip_only and not zip_only and is_bz2(filename):
        return bz2.BZ2File(filename, cmode)
    if not bz2_only and not gzip_only and zipfile.is_zipfile(filename):
        # Return fileobj for the first file in a zip file.
        with zipfile.ZipFile(filename, cmode) as zh:
            return zh.open(zh.namelist()[0], cmode)
    return open(filename, mode)
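A minimal usage sketch, assuming the get_fileobj above and its is_gzip/is_bz2 helpers are importable; the file name is hypothetical:

# Hypothetical input; behaves the same whether the file is plain,
# gzip-, bz2- or zip-compressed.
fh = get_fileobj("reads.fastq.bz2", mode="r")
try:
    for line in fh:
        pass  # process each decompressed line
finally:
    fh.close()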
Example #4
def _guess_open(filename):
    """
    Make a best-effort guess as to how to parse the given sequence file.

    Handles '-' as shortcut for stdin.
    Deals with .gz and .bz2 as well as plain text.
    """
    magic_dict = {
        b"\x1f\x8b\x08": "gz",
        b"\x42\x5a\x68": "bz2",
    }  # Inspired by http://stackoverflow.com/a/13044946/1585509

    if filename == '-':
        filename = '/dev/stdin'

    bufferedfile = io.open(file=filename, mode='rb', buffering=8192)
    num_bytes_to_peek = max(len(x) for x in magic_dict)
    file_start = bufferedfile.peek(num_bytes_to_peek)
    compression = None
    for magic, ftype in magic_dict.items():
        if file_start.startswith(magic):
            compression = ftype
            break
    if compression == 'bz2':
        sigfile = bz2file.BZ2File(filename=bufferedfile)
    elif compression == 'gz':
        if not bufferedfile.seekable():
            bufferedfile.close()
            raise ValueError("gzipped data not streamable, "
                             "pipe through zcat first")
        sigfile = gzip.GzipFile(filename=filename)
    else:
        sigfile = bufferedfile

    return sigfile
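A usage sketch, assuming _guess_open above together with its io/bz2file/gzip imports; the path is hypothetical:

# '-' would read from stdin; the .bz2 suffix is irrelevant because the
# format is detected from the magic bytes.
handle = _guess_open("sequences.fa.bz2")
first_record_header = handle.readline()
handle.close()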
Example #5
def populate_database(database_filename, dump_filename):
    """Step 5.
    Read and parse the downloaded file, and every time an article is
    encountered, we insert it in the database.
    """
    logging.info("Populating database (there are ca. 4M pages)...")
    connection = sqlite3.connect(database_filename)
    cursor = connection.cursor()
    with bz2file.BZ2File(dump_filename) as xml_file:
        parser = xml.etree.ElementTree.iterparse(xml_file)
        pbar = tqdm.tqdm(unit="page")
        for event, element in parser:
            if event == "end" and element.tag == NS + "page":
                pbar.update(1)
                if element.find(NS + "ns").text != "0":
                    element.clear()
                    continue
                title = element.find(NS + "title").text
                content = element.find(NS + "revision").find(NS + "text").text
                if "== {{langue|fr}} ==" not in content:
                    element.clear()
                    continue
                clean_content = clear_article_content(content)
                cursor.execute(
                    """INSERT INTO entries (title, content) VALUES (?, ?)""",
                    (title, clean_content))
                element.clear()
        pbar.close()
    logging.info("Commiting database insertions...")
    connection.commit()
    connection.close()
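The INSERT above assumes an entries table with title and content columns; a minimal sketch of creating it (the real project schema may add indexes or extra columns):

import sqlite3

def create_database(database_filename):
    # Minimal schema needed by the INSERT in populate_database.
    connection = sqlite3.connect(database_filename)
    connection.execute(
        "CREATE TABLE IF NOT EXISTS entries (title TEXT, content TEXT)")
    connection.commit()
    connection.close()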
Example #6
    def parseWikipedia(self, inPath, outPath, titlesPath):
        assert inPath != outPath

        self.loadTitles(titlesPath)

        self.outFile = None
        if outPath:
            self.outFile = codecs.open(outPath, "wt", "utf-8")

        compressed = inPath.endswith(".bz2")
        originalFile = open(inPath, "r" if compressed else "rt")
        if inPath.endswith(".bz2"):
            f = bz2file.BZ2File(originalFile, mode="r")
        else:
            f = originalFile

        lineNum = 0
        c = codecs.iterdecode(f, "utf-8")
        for line in c:
            if lineNum % 100000 == 0:
                print "Processing line", lineNum, "title", (
                    self.numTitles, self.numSkipped), "=", self.title
            self.processLine(line)
            lineNum += 1

        originalFile.close()
        if self.outFile:
            self.outFile.close()
Example #7
def multi_open(name):
    if name.endswith('.gz'):
        f = gzip.open(name)
    elif name.endswith('.bz2'):
        f = bz2.BZ2File(name)
    else:
        f = open(name)
    return f
def get_index(path):
    res = set()
    for line in bz2file.BZ2File(path):
        m = re.search(r'(\d+):\d+:.+', line.decode('utf-8', 'replace'))
        res.add(int(m.group(1)))
    res = sorted(res)
    res.append(-1)
    return res
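A usage sketch, assuming the index file follows the usual offset:page_id:title layout of multistream dump indexes; the file name is hypothetical:

# Each entry is a distinct byte offset of a bz2 stream inside the
# multistream dump; the trailing -1 marks the end of the file.
offsets = get_index("wiki-multistream-index.txt.bz2")
for start, end in zip(offsets, offsets[1:]):
    pass  # decompress the stream that spans [start, end)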
Example #9
    def open_reader(self, filename, *args, **kwargs):
        """
        Make a best-effort guess as to how to parse the given sequence file.

        Handles '-' as shortcut for stdin.
        Deals with .gz, FASTA, and FASTQ records.
        """
        magic_dict = {
            b"\x1f\x8b\x08": "gz",
            b"\x42\x5a\x68": "bz2",
            # "\x50\x4b\x03\x04": "zip"
        }  # Inspired by http://stackoverflow.com/a/13044946/1585509
        filename = _normalize_filename(filename)
        bufferedfile = io.open(file=filename, mode='rb', buffering=8192)
        num_bytes_to_peek = max(len(x) for x in magic_dict)
        file_start = bufferedfile.peek(num_bytes_to_peek)
        compression = None
        for magic, ftype in magic_dict.items():
            if file_start.startswith(magic):
                compression = ftype
                break
        if compression == 'bz2':
            sequencefile = bz2file.BZ2File(filename=bufferedfile)
            peek = sequencefile.peek(1)
        elif compression == 'gz':
            if not bufferedfile.seekable():
                bufferedfile.close()
                raise ValueError("gzipped data not streamable, "
                                 "pipe through zcat first")
            peek = gzip.GzipFile(filename=filename).read(1)
            sequencefile = gzip.GzipFile(filename=filename)
        else:
            peek = bufferedfile.peek(1)
            sequencefile = bufferedfile

        iter_fn = None
        try:
            first_char = peek[0]
        except IndexError as err:
            return []  # empty file

        try:
            first_char = chr(first_char)
        except TypeError:
            pass

        if first_char == '>':
            iter_fn = fasta_iter
        elif first_char == '@':
            iter_fn = fastq_iter

        if iter_fn is None:
            raise ValueError("unknown file format for '%s'" % filename)

        self.sequencefile = sequencefile
        return iter_fn(sequencefile, *args, **kwargs)
Example #10
def _open_bz2(filename, mode):
    if bz2 is None:
        raise ImportError("Cannot open bz2 files: The bz2 module is not available")
    if _PY3:
        return bz2.open(filename, mode)
    else:
        if mode[0] == 'a':
            raise ValueError("Mode '{}' not supported with BZ2 compression".format(mode))
        return bz2.BZ2File(filename, mode)
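A round-trip usage sketch, assuming the module-level bz2 import and _PY3 flag this helper relies on; the file name is hypothetical:

with _open_bz2("sample.txt.bz2", "wb") as fh:
    fh.write(b"hello world\n")
with _open_bz2("sample.txt.bz2", "rb") as fh:
    assert fh.read() == b"hello world\n"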
Example #11
def xml_to_csv(filename):
    # Construct dump file iterator
    input_file = Dump.from_file(bz2file.BZ2File(filename))

    print("Processing...")
    # Open output file
    output_csv = open(filename[0:-3] + "2csv", 'w')

    # writing header for output csv file
    output_csv.write(";".join([
        "page_id", "page_title", "page_ns", "revision_id", "revision_parent",
        "timestamp", "contributor_id", "contributor_name", "comments", "model"
        "bytes"
    ]))
    output_csv.write("\n")
    # Iterate through pages
    par = tqdm.tqdm()
    for page in input_file.pages:
        par.update(1)
        # get page info
        page_id = str(page.id)
        page_title = '|{}|'.format(page.title)
        page_ns = str(page.namespace)
        if page_id == '12':
            for revision in page:
                if revision is not None:
                    # get revision info
                    revision_id = str(revision.id)
                    if revision_id == '876580929':
                        text = str(revision.text)
                        revision_parent = '-1' if revision.parent_id is None else str(
                            revision.parent_id)
                        timestamp = str(revision.timestamp)
                        revision_bytes = '-1' if revision.bytes is None else str(
                            revision.bytes)

                        contributor_id = str(revision.user.id)
                        contributor_name = str(revision.user.text)

                        comment = str(revision.comment)
                        model = str(revision.model)

                        revision_row = [
                            page_id, page_title, page_ns, revision_id,
                            revision_parent, timestamp, contributor_id,
                            contributor_name, comment, model, revision_bytes,
                            text
                        ]
                        #~ print(revision_row)
                        output_csv.write(";".join(revision_row) + '\n')
                        return

    print("Done processing")
    output_csv.close()
    return True
Example #12
    def __init__(self, input_file):
        self.filename = input_file
        self.indexed = False

        if input_file.strip() == "-":
            ifile = sys.stdin
        elif input_file.endswith(".bz2"):
            try:
                ifile = bz2file.BZ2File(input_file, "r", buffering=0)
            except Exception as e:
                raise e
Example #13
def extract_bzip2(archive, compression, cmd, verbosity, interactive, outdir):
    """Extract a BZIP2 archive with the bz2 Python module."""
    targetname = util.get_single_outfile(outdir, archive)
    try:
        with bz2.BZ2File(archive) as bz2file:
            with open(targetname, 'wb') as targetfile:
                data = bz2file.read(READ_SIZE_BYTES)
                while data:
                    targetfile.write(data)
                    data = bz2file.read(READ_SIZE_BYTES)
    except Exception as err:
        msg = "error extracting %s to %s: %s" % (archive, targetname, err)
        raise util.PatoolError(msg)
    return None
def get_open(path, mode, file_type=None, encoding='utf-8'):
    def wrapper(opener):
        if 'r' in mode:
            return io.TextIOWrapper(io.BufferedReader(opener), encoding=encoding)
        else:
            return io.TextIOWrapper(opener, encoding=encoding)

    if file_type == 'gzip':
        return wrapper(gzip.GzipFile(path, mode))
    if file_type == 'bz2':
        import bz2file
        return wrapper(bz2file.BZ2File(path, mode))
    else:
        return io.open(path, mode, encoding=encoding)
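A usage sketch, assuming get_open above and a bz2-compressed UTF-8 text file; the path is hypothetical:

# 'r' in the mode makes get_open wrap the stream in a BufferedReader
# before decoding, so iteration yields str lines.
with get_open("corpus.txt.bz2", "rb", file_type="bz2") as fh:
    for line in fh:
        print(line.rstrip())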
Example #15
    def testSaveAsJSONBzip2(self):
        """
        A DiamondTabularFormatReader must be able to save itself as bzip2'd
        JSON.
        """
        mockOpener = mockOpen(read_data=DIAMOND_RECORDS)
        with patch.object(builtins, 'open', mockOpener):
            reader = DiamondTabularFormatReader('file.txt')
            data = BytesIO()
            fp = bz2file.BZ2File(data, 'w')
            reader.saveAsJSON(fp, writeBytes=True)
            fp.close()
            self.assertEqual(compress(DIAMOND_RECORDS_DUMPED.encode('UTF-8')),
                             data.getvalue())
    def _read_json(self, path_or_url, compressed=True, advanced_path=False):
        ''' Load JSON for a path. Allows remote files in addition to local ones. '''
        if parse_url(path_or_url).scheme in ['http', 'https']:
            try:
                req = _urlopen(path_or_url)
                filename_or_buffer = BytesIO(req.read())
            except HTTPError:
                logging.exception("HTTP Error accessing %s" % path_or_url)
                raise
            compressed = False
        else:
            filename_or_buffer = path_or_url

        try:
            if compressed:
                f = bz2.BZ2File(filename_or_buffer)
            else:
                if (type(filename_or_buffer) != BytesIO) and not isinstance(
                        filename_or_buffer, StringIO):
                    f = codecs.open(filename_or_buffer, 'r+', encoding="utf-8")
                else:
                    f = filename_or_buffer
            rawjson = f.readline()
            f.close()
        except IOError:
            logging.exception(
                "Can't read %s. Did you pass the incorrect "
                "'compressed=' argument?", path_or_url)
            raise
        except:
            print(compressed, type(filename_or_buffer))
            logging.exception("Can't open %s", path_or_url)
            raise

        # This is a bandaid for schema version 2.0, not over-engineered
        # since upcoming releases of the extracted features
        # dataset won't keep the basic/advanced split

        try:
            # For Python3 compatibility, decode to str object
            if PY3 and (type(rawjson) != str):
                rawjson = rawjson.decode()
            volumejson = json.loads(rawjson)
        except:
            logging.exception(
                "Problem reading JSON for %s. One common reason"
                " for this error is an incorrect compressed= "
                "argument", path_or_url)
            raise
        return volumejson
Example #17
def page_generator(tmp_dir, max_docs=None):
  doc = u""
  count = 0
  corpus_filepath = _maybe_download_corpus(tmp_dir)
  for line in bz2file.BZ2File(corpus_filepath, "r", buffering=1000000):
    line = unicode(line, "utf-8") if six.PY2 else line.decode("utf-8")
    if not doc and line != u"  <page>\n":
      continue
    doc += line
    if line == u"  </page>\n":
      yield doc
      doc = u""
      count += 1
      if max_docs and count >= max_docs:
        break
Example #18
def _unpack_zip(zipfile, all_tasks):
    fpath = '%s%s%s' % (settings.ZIP_DIR, os.sep, zipfile)
    try:
        b = bz2file.BZ2File(fpath)
        tar = tarfile.open(fileobj=b)
    except tarfile.ReadError:
        error("Could not read tarfile: %s" % fpath)
        return
    mkdir(settings.STAGE_DIR)
    tar.extractall(settings.STAGE_DIR)
    tar.close()
    move_results(all_tasks)
    ziplog = settings.ZIP_DIR + os.sep + 'abed_unzipped.txt'
    with open(ziplog, 'a') as fid:
        fid.write(zipfile + '\n')
Example #19
def create_bzip2(archive, compression, cmd, verbosity, interactive, filenames):
    """Create a BZIP2 archive with the bz2 Python module."""
    if len(filenames) > 1:
        raise util.PatoolError(
            'multi-file compression not supported in Python bz2')
    try:
        with bz2.BZ2File(archive, 'wb') as bz2file:
            filename = filenames[0]
            with open(filename, 'rb') as srcfile:
                data = srcfile.read(READ_SIZE_BYTES)
                while data:
                    bz2file.write(data)
                    data = srcfile.read(READ_SIZE_BYTES)
    except Exception as err:
        msg = "error creating %s: %s" % (archive, err)
        raise util.PatoolError(msg)
    return None
def compress(source, target, pagesize=4096):
    logging.debug("Starting compression of %s to %s", repr(source),
                  repr(target))
    logging.debug("Page size: %d", pagesize)
    size = os.path.getsize(source)
    with open(target, "wb") as ftarget:
        ftarget.write(util.create_header("bzip2", size))
        with bz2file.BZ2File(filename=ftarget, mode="wb",
                             compresslevel=9) as ftarget:
            for i, page in enumerate(util.get_pages(source,
                                                    pagesize=pagesize)):
                if i % 100 == 0 or (i + 1) * pagesize == size:
                    sys.stdout.write("\rProgress: {:.2f}%".format(
                        float(i * pagesize) / size * 100))
                    sys.stdout.flush()
                ftarget.write(page)
    sys.stdout.write("\n")
    logging.debug("Done")
Example #21
def check_bz2(file_path, check_content=True):
    try:
        with open(file_path, "rb") as temp:
            magic_check = temp.read(3)
        if magic_check != util.bz2_magic:
            return (False, False)
    except Exception:
        return (False, False)

    if not check_content:
        return (True, True)

    with bz2.BZ2File(file_path, mode='rb') as bzipped_file:
        chunk = bzipped_file.read(CHUNK_SIZE)
    # See if we have a compressed HTML file
    if check_html(chunk, file_path=False):
        return (True, False)
    return (True, True)
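Usage sketch (hypothetical path); the first flag says whether the bz2 magic bytes match, the second whether the content is usable rather than a compressed HTML page:

is_bz2_file, is_valid_content = check_bz2("/tmp/upload.dat")
if is_bz2_file and not is_valid_content:
    print("bz2 archive contains HTML, rejecting upload")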
Example #22
def handle_bz2(repository, uploaded_file_name):
    fd, uncompressed = tempfile.mkstemp(prefix='repo_%d_upload_bunzip2_' % repository.id,
                                        dir=os.path.dirname(uploaded_file_name),
                                        text=False)
    bzipped_file = bz2.BZ2File(uploaded_file_name, 'rb')
    while 1:
        try:
            chunk = bzipped_file.read(basic_util.CHUNK_SIZE)
        except IOError:
            os.close(fd)
            os.remove(uncompressed)
            log.exception('Problem uncompressing bz2 data "%s"', uploaded_file_name)
            return
        if not chunk:
            break
        os.write(fd, chunk)
    os.close(fd)
    bzipped_file.close()
    shutil.move(uncompressed, uploaded_file_name)
    def open(name):
        """
        Intended to be private to the class...

        A flexible open routine that can handle plain text files or
        files compressed with gzip or bzip2.  Only used for the
        input files. Output files are emitted uncompressed, until the
        tools in the next leg of the pipeline can work properly with
        compressed files.

        :param name: The filename to open.
        :return: A file object for the named file.
        """
        if name.endswith('.gz'):
            f = gzip.open(name)
        elif name.endswith('.bz2'):
            f = bz2.BZ2File(name)
        else:
            f = open(name)
        return f
Example #24
def check_bz2(file_path, check_content=True):
    try:
        with open(file_path, "rb") as temp:
            magic_check = temp.read(3)
        if magic_check != util.bz2_magic:
            return (False, False)
    except Exception:
        return (False, False)

    if not check_content:
        return (True, True)

    CHUNK_SIZE = 2**15  # 32 KB
    bzipped_file = bz2.BZ2File(file_path, mode='rb')
    chunk = bzipped_file.read(CHUNK_SIZE)
    bzipped_file.close()
    # See if we have a compressed HTML file
    if check_html(file_path, chunk=chunk):
        return (True, False)
    return (True, True)
Example #25
def load_channels(sample, chr_list):

    prefix = ''
    channel_names = [
        'clipped_reads', 'clipped_read_distance', 'coverage',
        'split_read_distance'
    ]

    channel_data = defaultdict(dict)
    for chrom in chr_list:
        logging.info('Loading data for Chr%s' % chrom)
        for ch in channel_names:
            logging.info('Loading data for channel %s' % ch)
            suffix = '.npy.bz2' if ch == 'coverage' else '.pbz2'
            if HPC_MODE:
                filename = "/hpc/cog_bioinf/ridder/users/smehrem/breakpoint-pairs/NA12878_channel_data/" + ch + "/" + chrom + "_" + ch + suffix
            else:
                filename = "/home/cog/smehrem/MinorResearchInternship/NA12878/" + ch + "/" + '_'.join(
                    [chrom, ch + suffix])
            assert os.path.isfile(filename)

            logging.info('Reading %s for Chr%s' % (ch, chrom))
            with bz2file.BZ2File(filename, 'rb') as f:
                if suffix == '.npy.bz2':
                    channel_data[chrom][ch] = np.load(f)
                else:
                    channel_data[chrom][ch] = pickle.load(f)
            logging.info('End of reading')

        # unpack clipped_reads
        channel_data[chrom]['read_quality'], channel_data[chrom]['clipped_reads'], \
        channel_data[chrom]['clipped_reads_inversion'], channel_data[chrom]['clipped_reads_duplication'], \
        channel_data[chrom]['clipped_reads_translocation'] = channel_data[chrom]['clipped_reads']

        # unpack split_reads
        channel_data[chrom]['split_read_distance'], \
        channel_data[chrom]['split_reads'] = channel_data[chrom]['split_read_distance']

    return channel_data
Example #26
def open_raw_data(filepath, binary=True):
    '''
    Open the input file which may be compressed.

    :param filepath: Path of raw data file which can either be zip, bz2 or uncompressed.
    :type filepath: str

    :returns: An opened file object.
    :rtype: file
    '''
    extension = os.path.splitext(filepath)[1].lower()

    if extension in {'.sac', '.zip'}:
        zf = zipfile.ZipFile(filepath, 'r')
        filenames = zf.namelist()
        if len(filenames) != 1:
            raise IOError('Zip files must contain only a single data file.')
        return zf.open(filenames[0])

    if extension in {'.bz2'}:
        return bz2.BZ2File(filepath, 'r')

    return open(filepath, 'rb' if binary else 'r')
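Usage sketch (hypothetical path); the caller gets a binary file object whether the raw data arrived zipped, bz2-compressed or uncompressed:

data = open_raw_data("flight_0001.dat.bz2")
try:
    header = data.read(64)
finally:
    data.close()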
Example #27
def page_generator(tmp_dir, max_docs=None):
    """
  Generate cleaned wikipedia articles as a string.
  """
    doc = u""
    count = 0
    corpus_filepath = _maybe_download_corpus(tmp_dir)
    for line in bz2file.BZ2File(corpus_filepath, "r", buffering=1000000):
        line = unicode(line, "utf-8") if six.PY2 else line.decode("utf-8")
        if not doc and line != u"  <page>\n":
            continue
        doc += line
        if line == u"  </page>\n":
            doc_text = _page_text(doc)
            if doc_text is not None:
                parsed_text = mwparserfromhell.parse(doc_text) \
                  .strip_code(normalize=True, collapse=True)
                yield parsed_text

            doc = u""
            count += 1
            if max_docs and count >= max_docs:
                break
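Usage sketch: pull a few cleaned articles from the generator (tmp_dir is wherever _maybe_download_corpus caches the dump; the path here is hypothetical):

for article in page_generator("/tmp/wiki_corpus", max_docs=3):
    print(article[:200])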
Example #28
def get_fileobj(filename, mode="r", compressed_formats=None):
    """
    Returns a fileobj. If the file is compressed, return an appropriate file
    reader. In text mode, always use 'utf-8' encoding.

    :param filename: path to file that should be opened
    :param mode: mode to pass to opener
    :param compressed_formats: list of allowed compressed file formats among
      'bz2', 'gzip' and 'zip'. If left to None, all 3 formats are allowed
    """
    if compressed_formats is None:
        compressed_formats = ['bz2', 'gzip', 'zip']
    # Remove 't' from mode, which may cause an error for compressed files
    mode = mode.replace('t', '')
    # the various compression readers don't support 'U' mode,
    # so we open in 'r'.
    if mode == 'U':
        cmode = 'r'
    else:
        cmode = mode
    if 'gzip' in compressed_formats and is_gzip(filename):
        fh = gzip.GzipFile(filename, cmode)
    elif 'bz2' in compressed_formats and is_bz2(filename):
        fh = bz2.BZ2File(filename, cmode)
    elif 'zip' in compressed_formats and zipfile.is_zipfile(filename):
        # Return fileobj for the first file in a zip file.
        with zipfile.ZipFile(filename, cmode) as zh:
            fh = zh.open(zh.namelist()[0], cmode)
    elif 'b' in mode:
        return open(filename, mode)
    else:
        return io.open(filename, mode, encoding='utf-8')
    if 'b' not in mode:
        return io.TextIOWrapper(fh, encoding='utf-8')
    else:
        return fh
Example #29
def inspect_pairs(candidate_pairs, outFile):

    final_pairs = set()

    # from bp1 point of view
    bp_dict = defaultdict(dict)
    bp_list = []
    for sv in candidate_pairs:
        bp1, bp2 = sv.tuple

        bp_id = bp1.id()
        bp2_id = '_'.join([bp2.chr, bp2.strand])
        if bp2_id not in bp_dict[bp_id]:
            bp_dict[bp_id] = defaultdict(list)
        bp_dict[bp_id][bp2_id].append(bp2.pos)
        bp_list.append(bp_id)

    bp_cnt = Counter(bp_list)
    min_support_bp = [k for (k, v) in bp_cnt.items() if v >= min_support]
    logging.info('Min %d supported positions bp1: %d/%d' %
                 (min_support, len(min_support_bp), len(bp_cnt)))
    for bp1_id in min_support_bp:
        bp1_chr, bp1_pos, bp1_strand = bp1_id.split('_')
        for bp2_id in bp_dict[bp1_id]:
            bp2_chr, bp2_strand = bp2_id.split('_')
            if len(bp_dict[bp1_id][bp2_id]) >= min_support:
                bp2_pos = max(
                    bp_dict[bp1_id][bp2_id]) if bp1_strand == '+' else min(
                        bp_dict[bp1_id][bp2_id])
                final_pairs.add(
                    StructuralVariant(
                        Breakpoint(bp1_chr, int(bp1_pos), bp1_strand),
                        Breakpoint(bp2_chr, int(bp2_pos), bp2_strand)))

    logging.info('Length of pair set after BP1 perspective: %d' %
                 len(final_pairs))

    # from bp2 point of view
    bp_dict = defaultdict(dict)
    bp_list = []
    for sv in candidate_pairs:
        bp1, bp2 = sv.tuple

        bp_id = bp2.id()
        bp1_id = '_'.join([bp1.chr, bp1.strand])
        if bp1_id not in bp_dict[bp_id]:
            bp_dict[bp_id] = defaultdict(list)
        bp_dict[bp_id][bp1_id].append(bp1.pos)
        bp_list.append(bp_id)

    bp_cnt = Counter(bp_list)
    min_support_bp = [k for (k, v) in bp_cnt.items() if v >= min_support]
    logging.info('Min %d supported positions bp2: %d/%d' %
                 (min_support, len(min_support_bp), len(bp_cnt)))
    for bp1_id in min_support_bp:
        bp1_chr, bp1_pos, bp1_strand = bp1_id.split('_')
        for bp2_id in bp_dict[bp1_id]:
            bp2_chr, bp2_strand = bp2_id.split('_')
            if len(bp_dict[bp1_id][bp2_id]) >= min_support:
                bp2_pos = max(
                    bp_dict[bp1_id][bp2_id]) if bp1_strand == '+' else min(
                        bp_dict[bp1_id][bp2_id])
                final_pairs.add(
                    StructuralVariant(
                        Breakpoint(bp1_chr, int(bp1_pos), bp1_strand),
                        Breakpoint(bp2_chr, int(bp2_pos), bp2_strand)))

    logging.info('Length of pair set after BP2 perspective: %d' %
                 len(final_pairs))

    # Write the output in pickle format
    with bz2file.BZ2File(outFile, 'wb') as f:
        pickle.dump(final_pairs, f)
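Read-back sketch: the pairs are pickled into a bz2 stream, so the same bz2file/pickle pairing recovers them (mirroring the loading pattern in Example #25):

with bz2file.BZ2File(outFile, 'rb') as f:
    final_pairs = pickle.load(f)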
def compress(source, target, reference, nointra, delta, inner, pagesize=4096):
    # some info
    logging.debug("Starting compression of %s to %s", repr(source),
                  repr(target))
    logging.debug("Page size: %d", pagesize)
    logging.debug("Reference dump: %s", reference)

    # pages + page numbers bookkeeping
    reference_pages, reference_pagenrs = [], {}
    for i, page in enumerate(util.get_pages(reference)):
        reference_pages.append(page)
        if page not in reference_pagenrs:
            reference_pagenrs[page] = i
    reference_pages_set = set(reference_pages)

    # find new + duplicatable pages
    dedups = dd(list)
    diffs = dd()
    diff_seen = set()
    if nointra:
        new_pagenrs = []
    else:
        new_pagenrs = dd(list)
    new_pages = []
    same_distinct, same_total = set(), 0
    source_pages = []
    for i, page in enumerate(util.get_pages(source)):
        source_pages.append(page)
        if reference_pages[i] != page:
            if page not in reference_pages_set:
                if delta is not None:
                    d = util.create_diff(reference_pages[i], page)
                    if d is not None:
                        diff_seen.add(page)
                        diffs[i] = d
                        continue
                if nointra:
                    new_pagenrs.append(i)
                else:
                    new_pagenrs[page].append(i)
                new_pages.append(page)
            else:
                dedups[page].append(i)
        else:
            same_total += 1
            same_distinct.add(page)
    source_pages_set = set(source_pages)
    newpagescnt = len(new_pages), len(set(new_pages))

    # intervalize
    if nointra:
        new_pagenrs = util.intervalize(new_pagenrs)
    else:
        new_pagenrs = {
            page: util.intervalize(new_pagenrs[page])
            for page in new_pagenrs
        }
    dedups = {page: util.intervalize(dedups[page]) for page in dedups}

    # write file
    util.create_dir(".tmp")
    tmphandle, tmpfile = tempfile.mkstemp(dir=".tmp")
    try:
        with open(tmpfile, "wb") as ftmp:
            ftmp.write(reference + "\x00")
            inorder = []
            seen = set()
            for page in reference_pages:
                if page in dedups and page not in seen:
                    inorder.append(page)
                    seen.add(page)
            util.create_pagenr_list(
                [reference_pagenrs[page] for page in inorder], ftmp)
            for page in inorder:
                ftmp.write(util.create_interval_list(dedups[page]))
            if delta is not None:
                util.create_pagenr_list(sorted(diffs), ftmp)
                for pagenr in sorted(diffs):
                    ftmp.write(diffs[pagenr])
            if nointra:
                ftmp.write(util.create_interval_list(new_pagenrs))
                for page in new_pages:
                    ftmp.write(page)
            else:
                ftmp.write(struct.pack("<I", len(new_pagenrs)))
                for page in new_pagenrs:
                    ftmp.write(util.create_interval_list(new_pagenrs[page]))
                for page in new_pagenrs:
                    ftmp.write(page)
        with open(tmpfile, "rb") as ftmp, open(target, "wb") as ftarget:
            ftarget.write(
                util.create_header(create_method_name(nointra, delta, inner),
                                   os.path.getsize(source)))
            ftarget.flush()
            if inner is None:
                shutil.copyfileobj(ftmp, ftarget)
            elif inner == "gzip":
                with gzip.GzipFile(fileobj=ftarget, mode="wb",
                                   compresslevel=9) as ftarget:
                    shutil.copyfileobj(ftmp, ftarget)
            elif inner == "bzip2":
                with bz2file.BZ2File(filename=ftarget,
                                     mode="wb",
                                     compresslevel=9) as ftarget:
                    shutil.copyfileobj(ftmp, ftarget)
            elif inner == "7zip":
                p = subprocess.Popen(
                    ["7za", "a", "-an", "-txz", "-mx=9", "-si", "-so", source],
                    stdin=ftmp,
                    stdout=ftarget,
                    stderr=subprocess.PIPE)
                p.communicate()
    finally:
        os.close(tmphandle)
        os.remove(tmpfile)

    # some info
    dedup_distinct = len(set(dedups.keys()) | same_distinct)
    dedup_total = same_total + sum(b - a + 1 for l in dedups.values()
                                   for a, b in l)
    logging.debug("Deduplicated pages at the same offset: %d/%d (%d/%d)",
                  same_total, len(source_pages), len(same_distinct),
                  len(source_pages_set))
    logging.debug("Deduplicated pages at different offsets: %d/%d (%d/%d)",
                  dedup_total - same_total, len(source_pages), len(dedups),
                  len(source_pages_set))
    logging.debug("Deduplicated pages in total: %d/%d (%d/%d)", dedup_total,
                  len(source_pages), dedup_distinct, len(source_pages_set))
    if delta is not None:
        logging.debug("Diffed pages: %d/%d (%d/%d)", len(diffs),
                      len(source_pages), len(diff_seen), len(source_pages_set))
    logging.debug("New pages: %d/%d (%d/%d)", newpagescnt[0],
                  len(source_pages), newpagescnt[1], len(source_pages_set))
    logging.debug("Done")

    return 0